Use lru_cache for tokenization tests (#36818)

* fix

* fix

* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Author: Yih-Dar, 2025-03-28 15:09:35 +01:00 (committed by GitHub)
parent 3af425d4c6
commit 1fcaad6df9
92 changed files with 1301 additions and 884 deletions
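
All of the diffs below apply the same pattern: per-test setUp fixtures become class-level setUpClass fixtures, and the get_tokenizer / get_rust_tokenizer helpers become classmethods memoized with functools.lru_cache behind a use_cache_if_possible guard imported from test_tokenization_common (whose implementation is not shown in this diff). The standalone sketch below is illustrative only: FakeTokenizer and ExampleTokenizationTest are hypothetical stand-ins, and the guard is assumed to simply bypass the cache when an argument cannot be hashed.

# Illustrative sketch of the caching pattern; names here (FakeTokenizer,
# ExampleTokenizationTest) are hypothetical, and this is NOT the actual
# use_cache_if_possible from tests/test_tokenization_common.py.
import functools
from functools import lru_cache


def use_cache_if_possible(func):
    # Assumed behaviour: fall back to an uncached call when lru_cache could
    # not hash the arguments (e.g. a list passed as a keyword argument).
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            hash(args)
            hash(tuple(sorted(kwargs.items())))
        except TypeError:
            return func.__wrapped__(*args, **kwargs)  # skip the lru_cache layer
        return func(*args, **kwargs)

    return wrapper


class FakeTokenizer:
    @classmethod
    def from_pretrained(cls, pretrained_name, **kwargs):
        return cls()  # stands in for an expensive tokenizer load


class ExampleTokenizationTest:
    tmpdirname = "/tmp/example-tokenizer"  # written once in setUpClass
    tokenizer_class = FakeTokenizer

    @classmethod
    @use_cache_if_possible
    @lru_cache(maxsize=64)
    def get_tokenizer(cls, pretrained_name=None, **kwargs):
        pretrained_name = pretrained_name or cls.tmpdirname
        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)


# Identical arguments now return the same cached instance instead of
# re-loading the tokenizer for every test method.
assert ExampleTokenizationTest.get_tokenizer() is ExampleTokenizationTest.get_tokenizer()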


@@ -34,12 +34,13 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_sentencepiece = True
     test_sentencepiece_ignore_case = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def get_input_output_texts(self, tokenizer):
         input_text = "this is a test"


@@ -14,13 +14,14 @@
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers import BartTokenizer, BartTokenizerFast, BatchEncoding
 from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_tokenizers, require_torch
 from transformers.utils import cached_property

-from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors
+from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors, use_cache_if_possible


 @require_tokenizers
@@ -32,8 +33,10 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_filter = filter_roberta_detectors
     # from_pretrained_kwargs = {'add_prefix_space': True}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+
         vocab = [
             "l",
             "o",
@@ -58,22 +61,30 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.special_tokens_map = {"unk_token": "<unk>"}
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         return "lower newer", "lower newer"
@@ -154,8 +165,8 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 sentence = "A, <mask> AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                 tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
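
Call sites such as test_embeded_special_tokens above now go through self.get_tokenizer(pretrained_name, **kwargs) rather than tokenizer_class.from_pretrained(...), so repeated loads of the same checkpoint inside one test class can hit the cache. A small illustration (the function and checkpoint name here are made up) of how lru_cache keys on the full argument combination:

# Illustration only: lru_cache builds its key from both positional and keyword
# arguments, so the same checkpoint + options pair is constructed exactly once.
from functools import lru_cache


@lru_cache(maxsize=64)
def load_tokenizer(pretrained_name, **kwargs):
    return object()  # stand-in for an expensive from_pretrained call


a = load_tokenizer("some/checkpoint", do_lower_case=False)
b = load_tokenizer("some/checkpoint", do_lower_case=False)
c = load_tokenizer("some/checkpoint", do_lower_case=True)

assert a is b       # identical arguments -> cache hit, same instance
assert a is not c   # different kwargs -> separate cache entry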


@@ -31,13 +31,14 @@ class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         tokenizer = BarthezTokenizerFast.from_pretrained("moussaKam/mbarthez")
-        tokenizer.save_pretrained(self.tmpdirname)
-        tokenizer.save_pretrained(self.tmpdirname, legacy_format=False)
-        self.tokenizer = tokenizer
+        tokenizer.save_pretrained(cls.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname, legacy_format=False)
+        cls.tokenizer = tokenizer

     def test_convert_token_and_id(self):
         """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""


@@ -15,11 +15,12 @@
 import os
 import unittest
+from functools import lru_cache

 from transformers.models.bartpho.tokenization_bartpho import VOCAB_FILES_NAMES, BartphoTokenizer
 from transformers.testing_utils import get_tests_dir

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model")
@@ -31,24 +32,29 @@ class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = ["▁This", "▁is", "▁a", "▁t", "est"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        self.special_tokens_map = {"unk_token": "<unk>"}
-        self.monolingual_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["monolingual_vocab_file"])
-        with open(self.monolingual_vocab_file, "w", encoding="utf-8") as fp:
+        cls.special_tokens_map = {"unk_token": "<unk>"}
+        cls.monolingual_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["monolingual_vocab_file"])
+        with open(cls.monolingual_vocab_file, "w", encoding="utf-8") as fp:
             for token in vocab_tokens:
                 fp.write(f"{token} {vocab_tokens[token]}\n")

-        tokenizer = BartphoTokenizer(SAMPLE_VOCAB, self.monolingual_vocab_file, **self.special_tokens_map)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer = BartphoTokenizer(SAMPLE_VOCAB, cls.monolingual_vocab_file, **cls.special_tokens_map)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return BartphoTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return BartphoTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "This is a là test"


@@ -41,8 +41,9 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     space_between_special_tokens = True
     from_pretrained_filter = filter_non_english

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = [
             "[UNK]",
@@ -61,8 +62,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "low",
             "lowest",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

     def get_input_output_texts(self, tokenizer):
@@ -257,7 +258,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_offsets_with_special_characters(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
                 tokens = tokenizer_r.encode_plus(
@@ -312,8 +313,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 kwargs["tokenize_chinese_chars"] = True
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
                 ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@@ -326,8 +327,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)

                 kwargs["tokenize_chinese_chars"] = False
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
                 ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)


@@ -34,11 +34,12 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_convert_token_and_id(self):
         """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""


@@ -17,6 +17,7 @@
 import os
 import pickle
 import unittest
+from functools import lru_cache

 from transformers import AutoTokenizer
 from transformers.models.bert.tokenization_bert import BertTokenizer
@@ -31,7 +32,7 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import (
 )
 from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi_projection

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @custom_tokenizers
@@ -41,8 +42,9 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     space_between_special_tokens = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = [
             "[UNK]",
@@ -72,8 +74,8 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "です",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

     def get_input_output_texts(self, tokenizer):
@@ -408,17 +410,21 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BertJapaneseTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

-    def get_tokenizer(self, **kwargs):
-        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        return BertJapaneseTokenizer.from_pretrained(cls.tmpdirname, subword_tokenizer_type="character", **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "こんにちは、世界。 \nこんばんは、世界。"


@@ -15,10 +15,11 @@
 import os
 import unittest
+from functools import lru_cache

 from transformers.models.bertweet.tokenization_bertweet import VOCAB_FILES_NAMES, BertweetTokenizer

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@@ -26,26 +27,31 @@ class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BertweetTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["I", "m", "V@@", "R@@", "r", "e@@"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "a m</w>"]
-        self.special_tokens_map = {"unk_token": "<unk>"}
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.special_tokens_map = {"unk_token": "<unk>"}
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             for token in vocab_tokens:
                 fp.write(f"{token} {vocab_tokens[token]}\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return BertweetTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return BertweetTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "I am VinAI Research"


@@ -36,11 +36,12 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

-        tokenizer = self.tokenizer_class(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer = cls.tokenizer_class(SAMPLE_VOCAB, keep_accents=True)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_convert_token_and_id(self):
         """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""


@@ -30,8 +30,9 @@ class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BioGptTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = [
@@ -60,11 +61,11 @@ class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w") as fp:
             fp.write(json.dumps(vocab_tokens))
-        with open(self.merges_file, "w") as fp:
+        with open(cls.merges_file, "w") as fp:
             fp.write("\n".join(merges))

     def get_input_output_texts(self, tokenizer):


@@ -18,13 +18,14 @@
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers.models.blenderbot_small.tokenization_blenderbot_small import (
     VOCAB_FILES_NAMES,
     BlenderbotSmallTokenizer,
 )

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
@@ -32,25 +33,30 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BlenderbotSmallTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "a p", "t e</w>", "ap t</w>", "a d", "ad apt</w>", "a c", "ac t</w>", ""]
-        self.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"}
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"}
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return BlenderbotSmallTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return BlenderbotSmallTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "adapt act apte"


@@ -13,14 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import copy
 import unittest
+from functools import lru_cache

 from datasets import load_dataset

 from transformers import BloomTokenizerFast
 from transformers.testing_utils import require_jinja, require_tokenizers

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -34,14 +36,21 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_vocab_key = "tokenizer_file"
     special_tokens_map = {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         tokenizer = BloomTokenizerFast.from_pretrained("bigscience/tokenizer")
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        _kwargs = copy.deepcopy(cls.special_tokens_map)
+        _kwargs.update(kwargs)
+        kwargs = _kwargs
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return BloomTokenizerFast.from_pretrained(pretrained_name, **kwargs)

     @unittest.skip(reason="This needs a slow tokenizer. Bloom does not have one!")
     def test_encode_decode_with_spaces(self):
@@ -65,7 +74,7 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=6):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
                 # tokenizer_r.pad_token = None # Hotfixing padding = None
                 # Simple input
                 s = "This is a simple input"


@@ -19,12 +19,13 @@ import re
 import shutil
 import tempfile
 import unittest
+from functools import lru_cache
 from typing import Tuple

 from transformers import AddedToken, BatchEncoding, ByT5Tokenizer
 from transformers.utils import cached_property, is_tf_available, is_torch_available

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 if is_torch_available():
@@ -39,17 +40,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = ByT5Tokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
         tokenizer = ByT5Tokenizer()
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     @cached_property
     def t5_base_tokenizer(self):
         return ByT5Tokenizer.from_pretrained("google/byt5-small")

-    def get_tokenizer(self, **kwargs) -> ByT5Tokenizer:
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> ByT5Tokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
         # XXX The default common tokenizer tests assume that every ID is decodable on its own.


@@ -15,6 +15,7 @@
 import tempfile
 import unittest
+from tempfile import TemporaryDirectory

 from transformers import AddedToken, CamembertTokenizer, CamembertTokenizerFast
 from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
@@ -38,12 +39,13 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     @unittest.skip(
         "Token maps are not equal because someone set the probability of ('<unk>NOTUSED', -100), so it's never encoded for fast"
@@ -72,8 +74,9 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_rust_and_python_bpe_tokenizers(self):
         tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
-        rust_tokenizer = CamembertTokenizerFast.from_pretrained(self.tmpdirname)
+        with TemporaryDirectory() as tmpdirname:
+            tokenizer.save_pretrained(tmpdirname)
+            rust_tokenizer = CamembertTokenizerFast.from_pretrained(tmpdirname)

         sequence = "I was born in 92000, and this is falsé."
@@ -147,11 +150,11 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
             return tokenizer

-        new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False)
+        new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 # Load a slow tokenizer from the hub, init with the new token for fast to also include it
-                tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
+                tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos)
                 EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
                 with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
                     self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
@@ -191,9 +194,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
                     if self.rust_tokenizer_class is not None:
-                        tokenizer_fast = self.rust_tokenizer_class.from_pretrained(
-                            pretrained_name, eos_token=new_eos, from_slow=True
-                        )
+                        tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos, from_slow=True)
                         self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
                         self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                         # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
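
Since tmpdirname is now a class-level directory backing the cached tokenizers, test_rust_and_python_bpe_tokenizers above saves its alternate BPE vocab to a throwaway TemporaryDirectory rather than reusing the shared fixture directory. A minimal sketch of that pattern (the file name and contents are made up):

# Minimal sketch: write throwaway artifacts to a fresh TemporaryDirectory so the
# shared class-level fixture directory is never clobbered.
import json
import os
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmpdirname:
    vocab_path = os.path.join(tmpdirname, "vocab.json")
    with open(vocab_path, "w", encoding="utf-8") as fp:
        json.dump({"<unk>": 0, "hello": 1}, fp)
    # ...load or inspect files from `tmpdirname` here...
# the directory and its contents are removed automatically on exit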


@@ -18,13 +18,14 @@ import os
 import shutil
 import tempfile
 import unittest
+from functools import lru_cache

 from transformers import BatchEncoding, CanineTokenizer
 from transformers.testing_utils import require_tokenizers, require_torch
 from transformers.tokenization_utils import AddedToken
 from transformers.utils import cached_property

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@@ -32,17 +33,22 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = CanineTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
         tokenizer = CanineTokenizer()
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     @cached_property
     def canine_tokenizer(self):
         return CanineTokenizer.from_pretrained("google/canine-s")

-    def get_tokenizer(self, **kwargs) -> CanineTokenizer:
-        tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> CanineTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        tokenizer = cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
         tokenizer._unicode_vocab_size = 1024
         return tokenizer


@@ -17,12 +17,13 @@
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers import CLIPTokenizer, CLIPTokenizerFast
 from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_ftfy, require_tokenizers

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -34,28 +35,37 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_kwargs = {}
     test_seq2seq = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"]  # fmt: skip
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"]
-        self.special_tokens_map = {"unk_token": "<unk>"}
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.special_tokens_map = {"unk_token": "<unk>"}
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return CLIPTokenizer.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return CLIPTokenizerFast.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
@@ -77,8 +87,8 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_check_encoding_slow_fast(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_s = self.get_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."
                 text_tokenized_s = tokenizer_s.tokenize(text)
@@ -138,7 +148,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 text_of_1_token = "hello"  # `hello` is a token in the vocabulary of `pretrained_name`
                 text = f"{text_of_1_token} {text_of_1_token}"

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name,
                     use_fast=True,
                 )
@@ -151,7 +161,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 text = f" {text}"

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name,
                     use_fast=True,
                 )
@@ -166,7 +176,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         # Test related to the breaking change introduced in transformers v4.17.0
         # We need to check that an error in raised when the user try to load a previous version of the tokenizer.
         with self.assertRaises(ValueError) as context:
-            self.rust_tokenizer_class.from_pretrained("robot-test/old-clip-tokenizer")
+            self.get_rust_tokenizer("robot-test/old-clip-tokenizer")

         self.assertTrue(
             context.exception.args[0].startswith(


@@ -17,11 +17,12 @@
 import json
 import os
 import unittest
+from functools import lru_cache
 from typing import List

 from transformers import ClvpTokenizer

-from ...test_tokenization_common import TokenizerTesterMixin, slow
+from ...test_tokenization_common import TokenizerTesterMixin, slow, use_cache_if_possible


 class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@@ -32,8 +33,9 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_seq2seq = False
     test_sentencepiece_ignore_case = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = [
@@ -62,19 +64,23 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
-        self.vocab_file = os.path.join(self.tmpdirname, "vocab.json")
-        self.merges_file = os.path.join(self.tmpdirname, "merges.txt")
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.special_tokens_map = {"unk_token": "<unk>"}
+        cls.vocab_file = os.path.join(cls.tmpdirname, "vocab.json")
+        cls.merges_file = os.path.join(cls.tmpdirname, "merges.txt")
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

     # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_tokenizer with GPT2->Clvp
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return ClvpTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return ClvpTokenizer.from_pretrained(pretrained_name, **kwargs)

     # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
     def get_input_output_texts(self, tokenizer):
@@ -134,7 +140,7 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=15):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
                 # Simple input
                 s = "This is a simple input"


@@ -53,15 +53,16 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_sentencepiece = True
     from_pretrained_kwargs = {}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
         tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_tokenizers(self, **kwargs):
+    def get_tokenizers(cls, **kwargs):
         kwargs.update({"pad_token": "<PAD>"})
         return super().get_tokenizers(**kwargs)
@@ -151,8 +152,8 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         ]
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 tmpdirname2 = tempfile.mkdtemp()
@@ -255,7 +256,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 added_tokens = [AddedToken("<special>", lstrip=True)]

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=added_tokens, **kwargs
                 )
                 r_output = tokenizer_r.encode("Hey this is a <special> token")
@@ -265,7 +266,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assertTrue(special_token_id in r_output)

                 if self.test_slow_tokenizer:
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_cr = self.get_rust_tokenizer(
                         pretrained_name,
                         additional_special_tokens=added_tokens,
                         **kwargs,  # , from_slow=True <- unfortunately too slow to convert


@@ -18,12 +18,13 @@ import json
 import os
 import re
 import unittest
+from functools import lru_cache

 from transformers import CodeGenTokenizer, CodeGenTokenizerFast
 from transformers.models.codegen.tokenization_codegen import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_tokenizers, slow

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -35,8 +36,9 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_kwargs = {"add_prefix_space": True}
     test_seq2seq = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = [
@@ -64,22 +66,30 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.special_tokens_map = {"unk_token": "<unk>"}
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return CodeGenTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return CodeGenTokenizer.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return CodeGenTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return CodeGenTokenizerFast.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
@@ -136,7 +146,7 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=15):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
                 # Simple input
                 s = "This is a simple input"


@@ -13,12 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import copy
 import unittest
+from functools import lru_cache

 from transformers import CohereTokenizerFast
 from transformers.testing_utils import require_jinja, require_tokenizers, require_torch_multi_gpu

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -37,14 +39,21 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         "pad_token": "<PAD>",
     }

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         tokenizer = CohereTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-CohereForCausalLM")
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return CohereTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        _kwargs = copy.deepcopy(cls.special_tokens_map)
+        _kwargs.update(kwargs)
+        kwargs = _kwargs
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return CohereTokenizerFast.from_pretrained(pretrained_name, **kwargs)

     # This gives CPU OOM on a single-gpu runner (~60G RAM). On multi-gpu runner, it has ~180G RAM which is enough.
     @require_torch_multi_gpu
@@ -80,7 +89,7 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=10):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
                 # tokenizer_r.pad_token = None # Hotfixing padding = None
                 # Simple input
                 s = "This is a simple input"


@@ -28,8 +28,9 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = CpmAntTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = [
             "<d>",
@@ -49,8 +50,8 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "n",
             "t",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

     @tooslow

View File

@ -16,10 +16,11 @@
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from transformers.models.ctrl.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer from transformers.models.ctrl.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@ -28,25 +29,30 @@ class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False test_rust_tokenizer = False
test_seq2seq = False test_seq2seq = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"] vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "a p", "ap t</w>", "r e", "a d", "ad apt</w>", ""] merges = ["#version: 0.2", "a p", "ap t</w>", "r e", "a d", "ad apt</w>", ""]
self.special_tokens_map = {"unk_token": "<unk>"} cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return CTRLTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "adapt react readapt apt" input_text = "adapt react readapt apt"

View File

@ -17,12 +17,13 @@
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from transformers import DebertaTokenizer, DebertaTokenizerFast from transformers import DebertaTokenizer, DebertaTokenizerFast
from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES
from transformers.testing_utils import slow from transformers.testing_utils import slow
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@ -31,8 +32,9 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
rust_tokenizer_class = DebertaTokenizerFast rust_tokenizer_class = DebertaTokenizerFast
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [ vocab = [
@ -59,18 +61,22 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
] ]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "[UNK]"} cls.special_tokens_map = {"unk_token": "[UNK]"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "lower newer" input_text = "lower newer"

View File

@ -33,12 +33,13 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True test_sentencepiece = True
test_sentencepiece_ignore_case = True test_sentencepiece_ignore_case = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>") tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>")
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "this is a test" input_text = "this is a test"

View File

@ -17,11 +17,11 @@
from transformers import DistilBertTokenizer, DistilBertTokenizerFast from transformers import DistilBertTokenizer, DistilBertTokenizerFast
from transformers.testing_utils import require_tokenizers, slow from transformers.testing_utils import require_tokenizers, slow
from ..bert.test_tokenization_bert import BertTokenizationTest from ..bert import test_tokenization_bert
@require_tokenizers @require_tokenizers
class DistilBertTokenizationTest(BertTokenizationTest): class DistilBertTokenizationTest(test_tokenization_bert.BertTokenizationTest):
tokenizer_class = DistilBertTokenizer tokenizer_class = DistilBertTokenizer
rust_tokenizer_class = DistilBertTokenizerFast rust_tokenizer_class = DistilBertTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True

View File

@ -25,11 +25,11 @@ from transformers import (
from transformers.testing_utils import require_tokenizers, slow from transformers.testing_utils import require_tokenizers, slow
from transformers.tokenization_utils_base import BatchEncoding from transformers.tokenization_utils_base import BatchEncoding
from ..bert.test_tokenization_bert import BertTokenizationTest from ..bert import test_tokenization_bert
@require_tokenizers @require_tokenizers
class DPRContextEncoderTokenizationTest(BertTokenizationTest): class DPRContextEncoderTokenizationTest(test_tokenization_bert.BertTokenizationTest):
tokenizer_class = DPRContextEncoderTokenizer tokenizer_class = DPRContextEncoderTokenizer
rust_tokenizer_class = DPRContextEncoderTokenizerFast rust_tokenizer_class = DPRContextEncoderTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
@ -37,7 +37,7 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest):
@require_tokenizers @require_tokenizers
class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): class DPRQuestionEncoderTokenizationTest(test_tokenization_bert.BertTokenizationTest):
tokenizer_class = DPRQuestionEncoderTokenizer tokenizer_class = DPRQuestionEncoderTokenizer
rust_tokenizer_class = DPRQuestionEncoderTokenizerFast rust_tokenizer_class = DPRQuestionEncoderTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
@ -45,7 +45,7 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
@require_tokenizers @require_tokenizers
class DPRReaderTokenizationTest(BertTokenizationTest): class DPRReaderTokenizationTest(test_tokenization_bert.BertTokenizationTest):
tokenizer_class = DPRReaderTokenizer tokenizer_class = DPRReaderTokenizer
rust_tokenizer_class = DPRReaderTokenizerFast rust_tokenizer_class = DPRReaderTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True

View File

@ -40,8 +40,9 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
space_between_special_tokens = True space_between_special_tokens = True
from_pretrained_filter = filter_non_english from_pretrained_filter = filter_non_english
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = [ vocab_tokens = [
"[UNK]", "[UNK]",
@ -60,8 +61,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low", "low",
"lowest", "lowest",
] ]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
@ -250,7 +251,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self): def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus( tokens = tokenizer_r.encode_plus(
@ -305,8 +306,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
kwargs["tokenize_chinese_chars"] = True kwargs["tokenize_chinese_chars"] = True
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@ -319,8 +320,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char) self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)
kwargs["tokenize_chinese_chars"] = False kwargs["tokenize_chinese_chars"] = False
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)

View File

@ -17,6 +17,7 @@
import os import os
import tempfile import tempfile
import unittest import unittest
from functools import lru_cache
from typing import List from typing import List
from transformers.models.esm.tokenization_esm import VOCAB_FILES_NAMES, EsmTokenizer from transformers.models.esm.tokenization_esm import VOCAB_FILES_NAMES, EsmTokenizer
@ -24,24 +25,32 @@ from transformers.testing_utils import require_tokenizers
from transformers.tokenization_utils import PreTrainedTokenizer from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from ...test_tokenization_common import use_cache_if_possible
@require_tokenizers @require_tokenizers
class ESMTokenizationTest(unittest.TestCase): class ESMTokenizationTest(unittest.TestCase):
tokenizer_class = EsmTokenizer tokenizer_class = EsmTokenizer
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
self.tmpdirname = tempfile.mkdtemp() super().setUpClass()
cls.tmpdirname = tempfile.mkdtemp()
vocab_tokens: List[str] = ["<cls>", "<pad>", "<eos>", "<unk>", "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z", "O", ".", "-", "<null_1>", "<mask>"] # fmt: skip vocab_tokens: List[str] = ["<cls>", "<pad>", "<eos>", "<unk>", "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z", "O", ".", "-", "<null_1>", "<mask>"] # fmt: skip
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]: def get_tokenizers(cls, **kwargs) -> List[PreTrainedTokenizerBase]:
return [self.get_tokenizer(**kwargs)] return [cls.get_tokenizer(**kwargs)]
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: @classmethod
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def test_tokenizer_single_example(self): def test_tokenizer_single_example(self):
tokenizer = self.tokenizer_class(self.vocab_file) tokenizer = self.tokenizer_class(self.vocab_file)

View File

@ -28,10 +28,11 @@ class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase)
tokenizer_class = FastSpeech2ConformerTokenizer tokenizer_class = FastSpeech2ConformerTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "this is a test" input_text = "this is a test"

View File

@ -30,8 +30,9 @@ class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = FlaubertTokenizer tokenizer_class = FlaubertTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "i</w>", "lo", "low", "ne", "new", "er</w>", "low</w>", "lowest</w>", "new</w>", "newer</w>", "wider</w>", "<unk>"] # fmt: skip vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "i</w>", "lo", "low", "ne", "new", "er</w>", "low</w>", "lowest</w>", "new</w>", "newer</w>", "wider</w>", "<unk>"] # fmt: skip
@ -39,11 +40,11 @@ class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["n e 300", "ne w 301", "e r</w> 302", ""] merges = ["n e 300", "ne w 301", "e r</w> 302", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
# Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_full_tokenizer # Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_full_tokenizer

View File

@ -36,12 +36,13 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece_ignore_case = True test_sentencepiece_ignore_case = True
test_seq2seq = False test_seq2seq = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = FNetTokenizer(SAMPLE_VOCAB) tokenizer = FNetTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "this is a test" input_text = "this is a test"
@ -147,7 +148,7 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)] added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs pretrained_name, additional_special_tokens=added_tokens, **kwargs
) )
r_output = tokenizer_r.encode("Hey this is a <special> token") r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -175,7 +176,7 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)] added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
) )
special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0] special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0]
@ -198,8 +199,8 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id
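Call sites like the FNet hunks above pass additional_special_tokens=added_tokens, i.e. a list of AddedToken objects. A list cannot be hashed, so a bare functools.lru_cache cannot build a key for such a call, which is presumably the situation the use_cache_if_possible guard has to handle. A standalone illustration of the limitation (not transformers code):

from functools import lru_cache

@lru_cache(maxsize=64)
def fake_loader(name, **kwargs):
    # Stand-in for a cached from_pretrained-style factory.
    return (name, tuple(sorted(kwargs)))

fake_loader("fnet", keep_accents=True)          # hashable arguments: cached normally
try:
    fake_loader("fnet", additional_special_tokens=["<special>"])
except TypeError as err:
    print(err)                                  # unhashable type: 'list'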

View File

@ -34,8 +34,9 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = FSMTTokenizer tokenizer_class = FSMTTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [ vocab = [
@ -64,22 +65,22 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""] merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
self.langs = ["en", "ru"] cls.langs = ["en", "ru"]
config = { config = {
"langs": self.langs, "langs": cls.langs,
"src_vocab_size": 10, "src_vocab_size": 10,
"tgt_vocab_size": 20, "tgt_vocab_size": 20,
} }
self.src_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"]) cls.src_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"])
self.tgt_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"]) cls.tgt_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"])
config_file = os.path.join(self.tmpdirname, "tokenizer_config.json") config_file = os.path.join(cls.tmpdirname, "tokenizer_config.json")
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.src_vocab_file, "w") as fp: with open(cls.src_vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens)) fp.write(json.dumps(vocab_tokens))
with open(self.tgt_vocab_file, "w") as fp: with open(cls.tgt_vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens)) fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp: with open(cls.merges_file, "w") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
with open(config_file, "w") as fp: with open(config_file, "w") as fp:
fp.write(json.dumps(config)) fp.write(json.dumps(config))

View File

@ -16,12 +16,13 @@
import os import os
import unittest import unittest
from functools import lru_cache
from transformers import FunnelTokenizer, FunnelTokenizerFast from transformers import FunnelTokenizer, FunnelTokenizerFast
from transformers.models.funnel.tokenization_funnel import VOCAB_FILES_NAMES from transformers.models.funnel.tokenization_funnel import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers from transformers.testing_utils import require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -32,8 +33,9 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
space_between_special_tokens = True space_between_special_tokens = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = [ vocab_tokens = [
"<unk>", "<unk>",
@ -50,15 +52,23 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low", "low",
"lowest", "lowest",
] ]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self, **kwargs): @classmethod
return FunnelTokenizer.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return FunnelTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs): @classmethod
return FunnelTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return FunnelTokenizerFast.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00e9d,running" input_text = "UNwant\u00e9d,running"

View File

@ -53,12 +53,13 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True test_sentencepiece = True
from_pretrained_kwargs = {} from_pretrained_kwargs = {}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.pad_token = tokenizer.eos_token tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
@require_torch @require_torch
def test_batch_tokenization(self): def test_batch_tokenization(self):
@ -103,7 +104,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)] added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs pretrained_name, additional_special_tokens=added_tokens, **kwargs
) )
r_output = tokenizer_r.encode("Hey this is a <special> token") r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -113,7 +114,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output) self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer: if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained( tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, pretrained_name,
additional_special_tokens=added_tokens, additional_special_tokens=added_tokens,
**kwargs, # , from_slow=True <- unfortunately too slow to convert **kwargs, # , from_slow=True <- unfortunately too slow to convert

View File

@ -17,12 +17,13 @@
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast
from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES
from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -34,8 +35,9 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_kwargs = {"add_prefix_space": True} from_pretrained_kwargs = {"add_prefix_space": True}
test_seq2seq = False test_seq2seq = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [ vocab = [
@ -63,22 +65,30 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
] ]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"} cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return GPT2Tokenizer.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return GPT2TokenizerFast.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return GPT2TokenizerFast.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "lower newer" input_text = "lower newer"
@ -135,7 +145,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=15): def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Simple input # Simple input
s = "This is a simple input" s = "This is a simple input"

View File

@ -17,6 +17,7 @@
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import ( from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import (
VOCAB_FILES_NAMES, VOCAB_FILES_NAMES,
@ -24,7 +25,7 @@ from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import
) )
from transformers.testing_utils import require_tokenizers, slow from transformers.testing_utils import require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -34,8 +35,9 @@ class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False test_rust_tokenizer = False
from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False} from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = [ vocab_tokens = [
"こん", "こん",
@ -62,18 +64,22 @@ class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"<|endoftext|>", "<|endoftext|>",
] ]
emoji_tokens = {"emoji": {"\ud83d\ude00": "<|emoji1|>"}, "emoji_inv": {"<|emoji1|>": "\ud83d\ude00"}} # 😀 emoji_tokens = {"emoji": {"\ud83d\ude00": "<|emoji1|>"}, "emoji_inv": {"<|emoji1|>": "\ud83d\ude00"}} # 😀
self.special_tokens_map = {"unk_token": "<unk>"} cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.emoji_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["emoji_file"]) cls.emoji_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["emoji_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
with open(self.emoji_file, "w") as emoji_writer: with open(cls.emoji_file, "w") as emoji_writer:
emoji_writer.write(json.dumps(emoji_tokens)) emoji_writer.write(json.dumps(emoji_tokens))
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return GPTNeoXJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return GPTNeoXJapaneseTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "こんにちは、世界。 \nこんばんは、㔺界。😀" input_text = "こんにちは、世界。 \nこんばんは、㔺界。😀"

View File

@ -33,13 +33,14 @@ class GPTSw3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True test_sentencepiece = True
test_sentencepiece_ignore_case = False test_sentencepiece_ignore_case = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, eos_token="<unk>", bos_token="<unk>", pad_token="<unk>") tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, eos_token="<unk>", bos_token="<unk>", pad_token="<unk>")
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "This is a test" input_text = "This is a test"

View File

@ -33,12 +33,13 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
rust_tokenizer_class = HerbertTokenizerFast rust_tokenizer_class = HerbertTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Use a simpler test file without japanese/chinese characters # Use a simpler test file without japanese/chinese characters
with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data: with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip() cls._data = f_data.read().replace("\n\n", "\n").strip()
vocab = [ vocab = [
"<s>", "<s>",
@ -69,11 +70,11 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""] merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w") as fp: with open(cls.vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens)) fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp: with open(cls.merges_file, "w") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
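Moving work from setUp to setUpClass, as in the Herbert fixture above (cls._data, cls.vocab_file, cls.merges_file), also changes the sharing model: whatever setUpClass builds is one object shared by every test in the class, so tests have to treat it as read-only. A small generic sketch of the hazard this implies (not transformers code):

import unittest

class SharedFixtureExample(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.shared = {"count": 0}      # built once, shared by every test below

    def test_a_mutates(self):
        self.shared["count"] += 1      # the mutation outlives this test

    def test_b_observes(self):
        # Under unittest's default alphabetical ordering test_a_mutates has
        # already run, so the shared fixture is no longer in its initial
        # state: class-level fixtures make tests order-dependent unless
        # they are only read, never modified.
        self.assertEqual(self.shared["count"], 1)

The tokenizer objects returned by the cached classmethods are shared in the same way, so the memoization stays safe only as long as tests do not mutate the tokenizers they get back.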

View File

@ -16,12 +16,13 @@
import os import os
import unittest import unittest
from functools import lru_cache
from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast
from transformers.models.layoutlm.tokenization_layoutlm import VOCAB_FILES_NAMES from transformers.models.layoutlm.tokenization_layoutlm import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers from transformers.testing_utils import require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -32,8 +33,9 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
space_between_special_tokens = True space_between_special_tokens = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = [ vocab_tokens = [
"[UNK]", "[UNK]",
@ -50,12 +52,16 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low", "low",
"lowest", "lowest",
] ]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self, **kwargs): @classmethod
return LayoutLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return LayoutLMTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00e9d,running" input_text = "UNwant\u00e9d,running"

View File

@ -102,8 +102,9 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return questions, words, boxes return questions, words, boxes
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = [ vocab_tokens = [
"[UNK]", "[UNK]",
@ -122,8 +123,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"test", "test",
"lowest", "lowest",
] ]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
@ -267,7 +268,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self): def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
words[1] = tokenizer_r.mask_token words[1] = tokenizer_r.mask_token
@ -605,8 +606,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=50): def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id
@ -1060,7 +1061,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Input tokens id # Input tokens id
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
@ -1363,7 +1364,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
@ -1417,7 +1418,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
tokens_r = tokenizer_r.encode_plus( tokens_r = tokenizer_r.encode_plus(
words, words,
@ -1715,7 +1716,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id

View File

@ -20,6 +20,7 @@ import re
import shutil import shutil
import tempfile import tempfile
import unittest import unittest
from functools import lru_cache
from typing import List from typing import List
from parameterized import parameterized from parameterized import parameterized
@ -41,7 +42,12 @@ from transformers.testing_utils import (
slow, slow,
) )
from ...test_tokenization_common import SMALL_TRAINING_CORPUS, TokenizerTesterMixin, merge_model_tokenizer_mappings from ...test_tokenization_common import (
SMALL_TRAINING_CORPUS,
TokenizerTesterMixin,
merge_model_tokenizer_mappings,
use_cache_if_possible,
)
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
@ -91,8 +97,9 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return questions, words, boxes return questions, words, boxes
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [ vocab = [
@ -119,22 +126,30 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
] ]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"} cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return LayoutLMv3TokenizerFast.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return LayoutLMv3TokenizerFast.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "lower newer" input_text = "lower newer"
@ -485,8 +500,8 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=50): def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id
@ -940,7 +955,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Input tokens id # Input tokens id
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
@ -1241,7 +1256,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
@ -1295,7 +1310,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
tokens_r = tokenizer_r.encode_plus( tokens_r = tokenizer_r.encode_plus(
words, words,
@ -1593,7 +1608,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id

View File

@ -96,12 +96,13 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return questions, words, boxes return questions, words, boxes
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = LayoutXLMTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = LayoutXLMTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00e9d,running" input_text = "UNwant\u00e9d,running"
@ -157,7 +158,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
_, _, boxes = self.get_question_words_and_boxes() _, _, boxes = self.get_question_words_and_boxes()
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_rust = self.rust_tokenizer_class.from_pretrained( tokenizer_rust = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
) )
tokenizer_py = self.tokenizer_class.from_pretrained( tokenizer_py = self.tokenizer_class.from_pretrained(
@ -206,7 +207,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self): def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
words[1] = tokenizer_r.mask_token words[1] = tokenizer_r.mask_token
@ -536,8 +537,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=50): def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id
@ -990,8 +991,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Input tokens id # Input tokens id
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
@ -1292,7 +1293,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
@ -1346,7 +1347,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
tokens_r = tokenizer_r.encode_plus( tokens_r = tokenizer_r.encode_plus(
words, words,
@ -1644,7 +1645,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id
@ -1743,7 +1744,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp() tmpdirname2 = tempfile.mkdtemp()


@ -14,13 +14,14 @@
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from transformers import BatchEncoding, LEDTokenizer, LEDTokenizerFast from transformers import BatchEncoding, LEDTokenizer, LEDTokenizerFast
from transformers.models.led.tokenization_led import VOCAB_FILES_NAMES from transformers.models.led.tokenization_led import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, require_torch from transformers.testing_utils import require_tokenizers, require_torch
from transformers.utils import cached_property from transformers.utils import cached_property
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -30,8 +31,10 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
rust_tokenizer_class = LEDTokenizerFast rust_tokenizer_class = LEDTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab = [ vocab = [
"l", "l",
"o", "o",
@ -56,22 +59,30 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
] ]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"} cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
return "lower newer", "lower newer" return "lower newer", "lower newer"
@ -161,8 +172,8 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self): def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence." sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)


@ -60,13 +60,14 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True test_sentencepiece = True
from_pretrained_kwargs = {} from_pretrained_kwargs = {}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.pad_token = tokenizer.eos_token tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def get_tokenizers(self, **kwargs): def get_tokenizers(self, **kwargs):
kwargs.update({"pad_token": "<PAD>"}) kwargs.update({"pad_token": "<PAD>"})
@ -149,8 +150,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list += (self.rust_tokenizer_class, "hf-internal-testing/llama-tokenizer", {}) self.tokenizers_list += (self.rust_tokenizer_class, "hf-internal-testing/llama-tokenizer", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp() tmpdirname2 = tempfile.mkdtemp()
@ -253,7 +254,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)] added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs pretrained_name, additional_special_tokens=added_tokens, **kwargs
) )
r_output = tokenizer_r.encode("Hey this is a <special> token") r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -263,7 +264,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output) self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer: if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained( tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, pretrained_name,
additional_special_tokens=added_tokens, additional_special_tokens=added_tokens,
**kwargs, # , from_slow=True <- unfortunately too slow to convert **kwargs, # , from_slow=True <- unfortunately too slow to convert
@ -313,8 +314,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
EXPECTED_WITH_SPACE = [1, 18637, 920, 526, 366, 2599] EXPECTED_WITH_SPACE = [1, 18637, 920, 526, 366, 2599]
EXPECTED_WO_SPACE = [1, 29950, 1032, 920, 526, 366, 2599] EXPECTED_WO_SPACE = [1, 29950, 1032, 920, 526, 366, 2599]
slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False) slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False) fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE) self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE)
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs)) self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
self.assertEqual(slow_.tokenize(inputs), ["H", "ey", "▁how", "▁are", "▁you", "▁doing"]) self.assertEqual(slow_.tokenize(inputs), ["H", "ey", "▁how", "▁are", "▁you", "▁doing"])
@ -324,8 +325,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True), fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True),
) )
slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False) slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False) fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE) self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE)
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs)) self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"]) self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"])
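The hunk above passes `additional_special_tokens=added_tokens` (a list) into the now-cached `get_rust_tokenizer`, which is exactly the kind of call a bare `functools.lru_cache` cannot key on; presumably this is what the extra `use_cache_if_possible` guard is for. A minimal stdlib-only demonstration with a toy function:

```python
from functools import lru_cache


@lru_cache(maxsize=64)
def get_tok(name, **kwargs):
    return (name, tuple(sorted(kwargs)))


get_tok("toy", legacy=False)  # fine: every argument is hashable
try:
    get_tok("toy", additional_special_tokens=["<special>"])  # list-valued kwarg
except TypeError as err:
    # "unhashable type: 'list'" -- raised while building the cache key,
    # before the wrapped function ever runs.
    print(err)
```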


@ -18,12 +18,13 @@ import itertools
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from transformers import AddedToken, LongformerTokenizer, LongformerTokenizerFast from transformers import AddedToken, LongformerTokenizer, LongformerTokenizerFast
from transformers.models.longformer.tokenization_longformer import VOCAB_FILES_NAMES from transformers.models.longformer.tokenization_longformer import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, slow from transformers.testing_utils import require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -36,8 +37,9 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
rust_tokenizer_class = LongformerTokenizerFast rust_tokenizer_class = LongformerTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [ vocab = [
@ -64,22 +66,30 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
] ]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"} cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "lower newer" input_text = "lower newer"
@ -173,8 +183,8 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self): def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence." sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
@ -204,7 +214,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_change_add_prefix_space_and_trim_offsets_args(self): def test_change_add_prefix_space_and_trim_offsets_args(self):
for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2): for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
) )
@ -224,7 +234,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name`
text = f"{text_of_1_token} {text_of_1_token}" text = f"{text_of_1_token} {text_of_1_token}"
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -234,7 +244,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
) )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -244,7 +254,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
) )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -254,7 +264,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)), (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
) )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -276,7 +286,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), # (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
# ) # )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -286,7 +296,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
) )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -296,7 +306,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
) )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)


@ -14,12 +14,13 @@
# limitations under the License. # limitations under the License.
import unittest import unittest
from functools import lru_cache
from typing import Tuple from typing import Tuple
from transformers import AddedToken, LukeTokenizer from transformers import AddedToken, LukeTokenizer
from transformers.testing_utils import get_tests_dir, require_torch, slow from transformers.testing_utils import get_tests_dir, require_torch, slow
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json") SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json")
@ -33,13 +34,17 @@ class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False test_rust_tokenizer = False
from_pretrained_kwargs = {"cls_token": "<s>"} from_pretrained_kwargs = {"cls_token": "<s>"}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"} cls.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}
-    def get_tokenizer(self, task=None, **kwargs):
-        kwargs.update(self.special_tokens_map)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, task=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
tokenizer = LukeTokenizer( tokenizer = LukeTokenizer(
vocab_file=SAMPLE_VOCAB, vocab_file=SAMPLE_VOCAB,
merges_file=SAMPLE_MERGE_FILE, merges_file=SAMPLE_MERGE_FILE,
@ -137,8 +142,8 @@ class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self): def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence." sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
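One behavioural consequence of the cached getters, shown here with toy classes (`DummyTok` and `DummyTesterMixin` are illustrative only): when a call's arguments are hashable and identical, the same tokenizer instance is returned again, so tests should treat it as shared, read-only state rather than mutating it in place.

```python
from functools import lru_cache


class DummyTok:
    def __init__(self, name, **kwargs):
        self.name = name
        self.kwargs = kwargs


class DummyTesterMixin:
    @classmethod
    @lru_cache(maxsize=64)
    def get_tokenizer(cls, pretrained_name=None, **kwargs):
        return DummyTok(pretrained_name or "local-fixture", **kwargs)


a = DummyTesterMixin.get_tokenizer("toy-checkpoint", legacy=False)
b = DummyTesterMixin.get_tokenizer("toy-checkpoint", legacy=False)
assert a is b       # cache hit: the very same object comes back
c = DummyTesterMixin.get_tokenizer("toy-checkpoint", legacy=True)
assert a is not c   # different kwargs -> a separate cache entry
```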


@ -32,8 +32,9 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
space_between_special_tokens = True space_between_special_tokens = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = [ vocab_tokens = [
"[UNK]", "[UNK]",
@ -50,8 +51,8 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low", "low",
"lowest", "lowest",
] ]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):


@ -14,6 +14,7 @@
import tempfile import tempfile
import unittest import unittest
from functools import lru_cache
from pathlib import Path from pathlib import Path
from shutil import copyfile from shutil import copyfile
@ -32,7 +33,7 @@ from transformers.utils import is_sentencepiece_available
if is_sentencepiece_available(): if is_sentencepiece_available():
from transformers.models.m2m_100.tokenization_m2m_100 import VOCAB_FILES_NAMES, save_json from transformers.models.m2m_100.tokenization_m2m_100 import VOCAB_FILES_NAMES, save_json
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
if is_sentencepiece_available(): if is_sentencepiece_available():
@ -54,21 +55,26 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_seq2seq = False test_seq2seq = False
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"] vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
save_dir = Path(self.tmpdirname) save_dir = Path(cls.tmpdirname)
save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"]) save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists(): if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"]) copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"])
tokenizer = M2M100Tokenizer.from_pretrained(self.tmpdirname) tokenizer = M2M100Tokenizer.from_pretrained(cls.tmpdirname)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
-    def get_tokenizer(self, **kwargs):
-        return M2M100Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return M2M100Tokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
return ( return (
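Note that `lru_cache` builds its key from the arguments actually passed; the `pretrained_name = pretrained_name or cls.tmpdirname` fallback only runs after a cache miss. Calling a getter with and without the explicit path therefore creates two cache entries (and two tokenizer objects) even though both resolve to the same directory. A toy stdlib-only sketch (`load` and the path are hypothetical):

```python
from functools import lru_cache

calls = []


@lru_cache(maxsize=64)
def load(path=None):  # stand-in for the cached get_tokenizer classmethods
    path = path or "/tmp/fixture"  # mirrors `pretrained_name or cls.tmpdirname`
    calls.append(path)
    return object()


a = load()                # cache key: no arguments
b = load("/tmp/fixture")  # cache key: explicit path
assert a is not b         # two entries, two objects
assert calls == ["/tmp/fixture", "/tmp/fixture"]  # the loader really ran twice
```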


@ -15,6 +15,7 @@
import tempfile import tempfile
import unittest import unittest
from functools import lru_cache
from pathlib import Path from pathlib import Path
from shutil import copyfile from shutil import copyfile
@ -26,7 +27,7 @@ from transformers.utils import is_sentencepiece_available, is_tf_available, is_t
if is_sentencepiece_available(): if is_sentencepiece_available():
from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model") SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model")
@ -50,22 +51,28 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False test_rust_tokenizer = False
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"] vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
save_dir = Path(self.tmpdirname) save_dir = Path(cls.tmpdirname)
save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab"]) save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab"])
save_json(mock_tokenizer_config, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"]) save_json(mock_tokenizer_config, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"])
if not (save_dir / VOCAB_FILES_NAMES["source_spm"]).exists(): if not (save_dir / VOCAB_FILES_NAMES["source_spm"]).exists():
copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["source_spm"]) copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["source_spm"])
copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["target_spm"]) copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["target_spm"])
tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname) tokenizer = MarianTokenizer.from_pretrained(cls.tmpdirname)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
-    def get_tokenizer(self, **kwargs) -> MarianTokenizer:
-        return MarianTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> MarianTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return MarianTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
return ( return (


@ -50,26 +50,27 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_kwargs = {"cls_token": "<s>"} from_pretrained_kwargs = {"cls_token": "<s>"}
test_seq2seq = False test_seq2seq = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", "\u0120hello", "\u0120world", "<unk>",] # fmt: skip vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", "\u0120hello", "\u0120world", "<unk>",] # fmt: skip
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3} cls.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
self.special_tokens_map = {"unk_token": "<unk>"} cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
self.tokenizer_config_file = os.path.join(self.tmpdirname, "tokenizer_config.json") cls.tokenizer_config_file = os.path.join(cls.tmpdirname, "tokenizer_config.json")
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
with open(self.tokenizer_config_file, "w", encoding="utf-8") as fp: with open(cls.tokenizer_config_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps({"tags_dict": self.tags_dict})) fp.write(json.dumps({"tags_dict": cls.tags_dict}))
def get_nodes_and_xpaths(self): def get_nodes_and_xpaths(self):
nodes = ["hello", "world"] nodes = ["hello", "world"]
@ -421,8 +422,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=50): def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id
@ -828,8 +829,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Input tokens id # Input tokens id
nodes, xpaths = self.get_nodes_and_xpaths() nodes, xpaths = self.get_nodes_and_xpaths()
@ -1010,7 +1011,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_mapping(self): def test_offsets_mapping(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
text = ["a", "wonderful", "test"] text = ["a", "wonderful", "test"]
xpaths = ["html/body" for _ in range(len(text))] xpaths = ["html/body" for _ in range(len(text))]
@ -1125,7 +1126,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
nodes, xpaths = self.get_nodes_and_xpaths() nodes, xpaths = self.get_nodes_and_xpaths()
@ -1187,7 +1188,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
nodes, xpaths = self.get_nodes_and_xpaths() nodes, xpaths = self.get_nodes_and_xpaths()
tokens_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True) tokens_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True)
tokens_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True) tokens_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True)
@ -1490,7 +1491,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id


@ -47,12 +47,13 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_full_tokenizer(self): def test_full_tokenizer(self):
tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True)
@ -139,8 +140,8 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart", {}) self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp() tmpdirname2 = tempfile.mkdtemp()


@ -47,12 +47,13 @@ class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True) tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self): def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@ -117,8 +118,8 @@ class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {}) self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp() tmpdirname2 = tempfile.mkdtemp()


@ -17,12 +17,13 @@
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from transformers import MgpstrTokenizer from transformers import MgpstrTokenizer
from transformers.models.mgp_str.tokenization_mgp_str import VOCAB_FILES_NAMES from transformers.models.mgp_str.tokenization_mgp_str import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers from transformers.testing_utils import require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -33,18 +34,23 @@ class MgpstrTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_kwargs = {} from_pretrained_kwargs = {}
test_seq2seq = False test_seq2seq = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab = ['[GO]', '[s]', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] # fmt: skip vocab = ['[GO]', '[s]', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] # fmt: skip
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
-    def get_tokenizer(self, **kwargs):
-        return MgpstrTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return MgpstrTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "tester" input_text = "tester"


@ -15,12 +15,13 @@
import unittest import unittest
from functools import lru_cache
from typing import Tuple from typing import Tuple
from transformers.models.mluke.tokenization_mluke import MLukeTokenizer from transformers.models.mluke.tokenization_mluke import MLukeTokenizer
from transformers.testing_utils import get_tests_dir, require_torch, slow from transformers.testing_utils import get_tests_dir, require_torch, slow
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@ -33,13 +34,17 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False test_rust_tokenizer = False
from_pretrained_kwargs = {"cls_token": "<s>"} from_pretrained_kwargs = {"cls_token": "<s>"}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"} cls.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}
-    def get_tokenizer(self, task=None, **kwargs):
-        kwargs.update(self.special_tokens_map)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, task=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
kwargs.update({"task": task}) kwargs.update({"task": task})
tokenizer = MLukeTokenizer(vocab_file=SAMPLE_VOCAB, entity_vocab_file=SAMPLE_ENTITY_VOCAB, **kwargs) tokenizer = MLukeTokenizer(vocab_file=SAMPLE_VOCAB, entity_vocab_file=SAMPLE_ENTITY_VOCAB, **kwargs)
return tokenizer return tokenizer
@ -100,8 +105,8 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self): def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence." sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
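Because the getters are classmethods, `cls` itself is the first cached argument, so test classes that inherit the same getter never share cached tokenizers even for identical arguments. A toy sketch (`Base`, `TestA`, `TestB` are illustrative):

```python
from functools import lru_cache


class Base:
    @classmethod
    @lru_cache(maxsize=64)
    def get_tokenizer(cls, pretrained_name=None):
        return object()  # stand-in for an expensive from_pretrained call


class TestA(Base):
    pass


class TestB(Base):
    pass


assert TestA.get_tokenizer("same-checkpoint") is TestA.get_tokenizer("same-checkpoint")
assert TestA.get_tokenizer("same-checkpoint") is not TestB.get_tokenizer("same-checkpoint")
# cls is part of every key; all entries still count against the single maxsize=64
# of the one underlying function defined on Base.
```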


@ -41,8 +41,9 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_filter = filter_non_english from_pretrained_filter = filter_non_english
pre_trained_model_path = "google/mobilebert-uncased" pre_trained_model_path = "google/mobilebert-uncased"
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = [ vocab_tokens = [
"[UNK]", "[UNK]",
@ -61,13 +62,13 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low", "low",
"lowest", "lowest",
] ]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
self.tokenizers_list = [ cls.tokenizers_list = [
(tokenizer_def[0], self.pre_trained_model_path, tokenizer_def[2]) # else the 'google/' prefix is stripped (tokenizer_def[0], cls.pre_trained_model_path, tokenizer_def[2]) # else the 'google/' prefix is stripped
for tokenizer_def in self.tokenizers_list for tokenizer_def in cls.tokenizers_list
] ]
# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.get_input_output_texts # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.get_input_output_texts
@ -275,7 +276,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self): def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus( tokens = tokenizer_r.encode_plus(
@ -331,8 +332,8 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
kwargs["tokenize_chinese_chars"] = True kwargs["tokenize_chinese_chars"] = True
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@ -345,8 +346,8 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char) self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)
kwargs["tokenize_chinese_chars"] = False kwargs["tokenize_chinese_chars"] = False
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)


@ -51,8 +51,9 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
from_pretrained_kwargs = {} from_pretrained_kwargs = {}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = PreTrainedTokenizerFast( tokenizer = PreTrainedTokenizerFast(
@ -62,10 +63,11 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
eos_token="</s>", eos_token="</s>",
) )
tokenizer.pad_token = tokenizer.eos_token tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
-    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizerFast:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
@unittest.skip(reason="No slow tokenizer") @unittest.skip(reason="No slow tokenizer")
def test_added_tokens_serialization(self): def test_added_tokens_serialization(self):


@ -32,8 +32,9 @@ class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
space_between_special_tokens = True space_between_special_tokens = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = [ vocab_tokens = [
"[UNK]", "[UNK]",
@ -52,8 +53,8 @@ class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
"low", "low",
"lowest", "lowest",
] ]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):

View File

@ -14,13 +14,14 @@
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from transformers import BatchEncoding, MvpTokenizer, MvpTokenizerFast from transformers import BatchEncoding, MvpTokenizer, MvpTokenizerFast
from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, require_torch from transformers.testing_utils import require_tokenizers, require_torch
from transformers.utils import cached_property from transformers.utils import cached_property
from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -32,8 +33,10 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_filter = filter_roberta_detectors from_pretrained_filter = filter_roberta_detectors
# from_pretrained_kwargs = {'add_prefix_space': True} # from_pretrained_kwargs = {'add_prefix_space': True}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab = [ vocab = [
"l", "l",
"o", "o",
@ -58,22 +61,30 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
] ]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"} cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
return "lower newer", "lower newer" return "lower newer", "lower newer"
@ -153,8 +164,8 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self): def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence." sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
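The Mvp getters above show the full pattern this commit rolls out: get_tokenizer and get_rust_tokenizer become classmethods wrapped in lru_cache (plus the suite's use_cache_if_possible guard) and take an optional pretrained_name that falls back to the class-level tmpdirname. A stripped-down sketch of the resulting caching behaviour, with a hypothetical DummyTokenizer standing in for the real classes:

from functools import lru_cache

class DummyTokenizer:
    def __init__(self, name, **kwargs):
        self.name, self.kwargs = name, kwargs

class CachedGetterSketch:
    tmpdirname = "/tmp/dummy-tokenizer"  # illustrative path only

    @classmethod
    @lru_cache(maxsize=64)
    def get_tokenizer(cls, pretrained_name=None, **kwargs):
        pretrained_name = pretrained_name or cls.tmpdirname
        return DummyTokenizer(pretrained_name, **kwargs)

a = CachedGetterSketch.get_tokenizer()
b = CachedGetterSketch.get_tokenizer()
c = CachedGetterSketch.get_tokenizer(do_lower_case=True)
assert a is b      # identical arguments -> the cached instance is reused
assert a is not c  # different kwargs -> a separate cache entry

Because cls is part of the cache key, every test class keeps its own entries, and because lru_cache hashes keyword arguments, callers must pass hashable values.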

View File

@ -16,6 +16,7 @@ import binascii
import unittest import unittest
from transformers import MyT5Tokenizer from transformers import MyT5Tokenizer
from transformers.testing_utils import slow
from transformers.utils import is_tf_available, is_torch_available from transformers.utils import is_tf_available, is_torch_available
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin
@ -86,15 +87,14 @@ class TestByteRewriter(unittest.TestCase):
self.assertEqual(decompose_rewriter.rewrite_bytes(in_hex), out_hex) self.assertEqual(decompose_rewriter.rewrite_bytes(in_hex), out_hex)
# This is way too slow, so let's not run it on CircleCI. When trying to use the cache, we get OOM and the worker(s) crash.
@slow
class MyT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): class MyT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MyT5Tokenizer tokenizer_class = MyT5Tokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): def get_tokenizer(cls, **kwargs) -> MyT5Tokenizer:
super().setUp() return cls.tokenizer_class.from_pretrained("Tomlim/myt5-base", **kwargs)
def get_tokenizer(self, **kwargs) -> MyT5Tokenizer:
return self.tokenizer_class.from_pretrained("Tomlim/myt5-base", **kwargs)
@unittest.skip(reason="inputs cannot be pretokenized as ids depend on whole input string") @unittest.skip(reason="inputs cannot be pretokenized as ids depend on whole input string")
def test_pretokenized_inputs(self): def test_pretokenized_inputs(self):
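The comment added above is the trade-off in a nutshell: an lru_cache with maxsize=64 keeps every distinct tokenizer it has built alive for the lifetime of the worker, which is the kind of memory pressure behind the CircleCI OOM, so this suite is marked slow instead. A standalone reminder (plain lru_cache, not the commit's decorated getters) of how to inspect and release that memory:

from functools import lru_cache

@lru_cache(maxsize=64)
def build(name):
    return object()  # stands in for a large tokenizer object

for i in range(10):
    build(f"checkpoint-{i}")

print(build.cache_info())  # CacheInfo(hits=0, misses=10, maxsize=64, currsize=10)
build.cache_clear()        # drops every cached object if memory becomes a concern
print(build.cache_info())  # currsize is back to 0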

View File

@ -56,12 +56,13 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True test_sentencepiece = True
from_pretrained_kwargs = {} from_pretrained_kwargs = {}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_full_tokenizer(self): def test_full_tokenizer(self):
tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True)
@ -143,8 +144,8 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-nllb", {}) self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-nllb", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp() tmpdirname2 = tempfile.mkdtemp()
@ -262,7 +263,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)] added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs pretrained_name, additional_special_tokens=added_tokens, **kwargs
) )
r_output = tokenizer_r.encode("Hey this is a <special> token") r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -272,7 +273,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output) self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer: if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained( tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, pretrained_name,
additional_special_tokens=added_tokens, additional_special_tokens=added_tokens,
**kwargs, # , from_slow=True <- unfortunately too slow to convert **kwargs, # , from_slow=True <- unfortunately too slow to convert
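Call sites like the NLLB ones above now go through the cached getters while passing additional_special_tokens as a list of AddedToken objects. A bare lru_cache hashes its keyword arguments, so such a value cannot form a cache key on its own; that is presumably what the use_cache_if_possible guard imported elsewhere in this commit has to handle. A tiny demonstration of the failure mode:

from functools import lru_cache

@lru_cache(maxsize=64)
def load(name, **kwargs):
    return (name, tuple(sorted(kwargs)))

load("tiny-random-nllb", src_lang="eng_Latn")  # fine: every argument is hashable
try:
    load("tiny-random-nllb", additional_special_tokens=["<special>"])
except TypeError as err:
    print("cannot build a cache key:", err)    # unhashable type: 'list'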

View File

@ -13,13 +13,15 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import copy
import unittest import unittest
from functools import lru_cache
from transformers import NougatTokenizerFast from transformers import NougatTokenizerFast
from transformers.models.nougat.tokenization_nougat_fast import markdown_compatible, normalize_list_like_lines from transformers.models.nougat.tokenization_nougat_fast import markdown_compatible, normalize_list_like_lines
from transformers.testing_utils import require_levenshtein, require_nltk, require_tokenizers from transformers.testing_utils import require_levenshtein, require_nltk, require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -33,19 +35,26 @@ class NougatTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_vocab_key = "tokenizer_file" from_pretrained_vocab_key = "tokenizer_file"
special_tokens_map = {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"} special_tokens_map = {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
tokenizer = NougatTokenizerFast.from_pretrained("facebook/nougat-base") tokenizer = NougatTokenizerFast.from_pretrained("facebook/nougat-base")
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def get_rust_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return NougatTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
_kwargs = copy.deepcopy(cls.special_tokens_map)
_kwargs.update(kwargs)
kwargs = _kwargs
pretrained_name = pretrained_name or cls.tmpdirname
return NougatTokenizerFast.from_pretrained(pretrained_name, **kwargs)
def test_padding(self, max_length=6): def test_padding(self, max_length=6):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Simple input # Simple input
sentence1 = "This is a simple input" sentence1 = "This is a simple input"
sentence2 = ["This is a simple input 1", "This is a simple input 2"] sentence2 = ["This is a simple input 1", "This is a simple input 2"]

View File

@ -35,8 +35,9 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
test_seq2seq = False test_seq2seq = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [ vocab = [
@ -65,11 +66,11 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""] merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w") as fp: with open(cls.vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens)) fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp: with open(cls.merges_file, "w") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
@ -90,7 +91,7 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=15): def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Simple input # Simple input
s = "This is a simple input" s = "This is a simple input"

View File

@ -13,12 +13,13 @@
# limitations under the License. # limitations under the License.
import unittest import unittest
from functools import lru_cache
from transformers import PegasusTokenizer, PegasusTokenizerFast from transformers import PegasusTokenizer, PegasusTokenizerFast
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow
from transformers.utils import cached_property from transformers.utils import cached_property
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model") SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")
@ -33,19 +34,24 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = PegasusTokenizer(SAMPLE_VOCAB) tokenizer = PegasusTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
@cached_property @cached_property
def _large_tokenizer(self): def _large_tokenizer(self):
return PegasusTokenizer.from_pretrained("google/pegasus-large") return PegasusTokenizer.from_pretrained("google/pegasus-large")
def get_tokenizer(self, **kwargs) -> PegasusTokenizer: @classmethod
return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PegasusTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return PegasusTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
return ("This is a test", "This is a test") return ("This is a test", "This is a test")
@ -70,8 +76,8 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertEqual(self.get_tokenizer().vocab_size, 1_103) self.assertEqual(self.get_tokenizer().vocab_size, 1_103)
def test_mask_tokens_rust_pegasus(self): def test_mask_tokens_rust_pegasus(self):
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname) rust_tokenizer = self.get_rust_tokenizer(self.tmpdirname)
py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname) py_tokenizer = self.get_tokenizer(self.tmpdirname)
raw_input_str = ( raw_input_str = (
"Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important" "Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important"
" </s> <pad> <pad> <pad>" " </s> <pad> <pad> <pad>"
@ -138,26 +144,31 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = PegasusTokenizer(SAMPLE_VOCAB, offset=0, mask_token_sent=None, mask_token="[MASK]") tokenizer = PegasusTokenizer(SAMPLE_VOCAB, offset=0, mask_token_sent=None, mask_token="[MASK]")
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
@cached_property @cached_property
def _large_tokenizer(self): def _large_tokenizer(self):
return PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv") return PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
def get_tokenizer(self, **kwargs) -> PegasusTokenizer: @classmethod
return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PegasusTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return PegasusTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
return ("This is a test", "This is a test") return ("This is a test", "This is a test")
def test_mask_tokens_rust_pegasus(self): def test_mask_tokens_rust_pegasus(self):
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname) rust_tokenizer = self.get_rust_tokenizer(self.tmpdirname)
py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname) py_tokenizer = self.get_tokenizer(self.tmpdirname)
raw_input_str = ( raw_input_str = (
"Let's see which <unk> is the better <unk_token> one [MASK] It seems like this [MASK] was important </s>" "Let's see which <unk> is the better <unk_token> one [MASK] It seems like this [MASK] was important </s>"
" <pad> <pad> <pad>" " <pad> <pad> <pad>"

View File

@ -19,12 +19,13 @@ import re
import shutil import shutil
import tempfile import tempfile
import unittest import unittest
from functools import lru_cache
from typing import Tuple from typing import Tuple
from transformers import AddedToken, BatchEncoding, PerceiverTokenizer from transformers import AddedToken, BatchEncoding, PerceiverTokenizer
from transformers.utils import cached_property, is_tf_available, is_torch_available from transformers.utils import cached_property, is_tf_available, is_torch_available
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
if is_torch_available(): if is_torch_available():
@ -40,17 +41,22 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = PerceiverTokenizer tokenizer_class = PerceiverTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
tokenizer = PerceiverTokenizer() tokenizer = PerceiverTokenizer()
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
@cached_property @cached_property
def perceiver_tokenizer(self): def perceiver_tokenizer(self):
return PerceiverTokenizer.from_pretrained("deepmind/language-perceiver") return PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
def get_tokenizer(self, **kwargs) -> PerceiverTokenizer: @classmethod
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PerceiverTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]: def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
# XXX The default common tokenizer tests assume that every ID is decodable on its own. # XXX The default common tokenizer tests assume that every ID is decodable on its own.

View File

@ -15,10 +15,11 @@
import os import os
import unittest import unittest
from functools import lru_cache
from transformers.models.phobert.tokenization_phobert import VOCAB_FILES_NAMES, PhobertTokenizer from transformers.models.phobert.tokenization_phobert import VOCAB_FILES_NAMES, PhobertTokenizer
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@ -26,27 +27,32 @@ class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = PhobertTokenizer tokenizer_class = PhobertTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["T@@", "i", "I", "R@@", "r", "e@@"] vocab = ["T@@", "i", "I", "R@@", "r", "e@@"]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "l à</w>"] merges = ["#version: 0.2", "l à</w>"]
self.special_tokens_map = {"unk_token": "<unk>"} cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
for token in vocab_tokens: for token in vocab_tokens:
fp.write(f"{token} {vocab_tokens[token]}\n") fp.write(f"{token} {vocab_tokens[token]}\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return PhobertTokenizer.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return PhobertTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "Tôi là VinAI Research" input_text = "Tôi là VinAI Research"

View File

@ -45,12 +45,13 @@ class PLBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
rust_tokenizer_class = None rust_tokenizer_class = None
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True) tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_full_base_tokenizer(self): def test_full_base_tokenizer(self):
tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True) tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True)

View File

@ -36,8 +36,9 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = ProphetNetTokenizer tokenizer_class = ProphetNetTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = [ vocab_tokens = [
"[UNK]", "[UNK]",
@ -56,8 +57,8 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low", "low",
"lowest", "lowest",
] ]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):

View File

@ -14,15 +14,17 @@
# limitations under the License. # limitations under the License.
import copy
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from transformers import AddedToken, Qwen2Tokenizer, Qwen2TokenizerFast from transformers import AddedToken, Qwen2Tokenizer, Qwen2TokenizerFast
from transformers.models.qwen2.tokenization_qwen2 import VOCAB_FILES_NAMES, bytes_to_unicode from transformers.models.qwen2.tokenization_qwen2 import VOCAB_FILES_NAMES, bytes_to_unicode
from transformers.testing_utils import require_tokenizers, slow from transformers.testing_utils import require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -36,8 +38,9 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_kwargs = None from_pretrained_kwargs = None
test_seq2seq = False test_seq2seq = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# this makes sure the vocabulary is complete at the byte level. # this makes sure the vocabulary is complete at the byte level.
vocab = list(bytes_to_unicode().values()) vocab = list(bytes_to_unicode().values())
@ -81,22 +84,34 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"# #", "# #",
] ]
self.special_tokens_map = {"eos_token": "<|endoftext|>"} cls.special_tokens_map = {"eos_token": "<|endoftext|>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return Qwen2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
_kwargs = copy.deepcopy(cls.special_tokens_map)
_kwargs.update(kwargs)
kwargs = _kwargs
pretrained_name = pretrained_name or cls.tmpdirname
return Qwen2Tokenizer.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return Qwen2TokenizerFast.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
_kwargs = copy.deepcopy(cls.special_tokens_map)
_kwargs.update(kwargs)
kwargs = _kwargs
pretrained_name = pretrained_name or cls.tmpdirname
return Qwen2TokenizerFast.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
# this case should cover # this case should cover

View File

@ -34,11 +34,12 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_seq2seq = False test_seq2seq = False
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self): def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@ -84,7 +85,7 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=15): def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Simple input # Simple input
s = "This is a simple input" s = "This is a simple input"

View File

@ -39,11 +39,12 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece_ignore_case = True test_sentencepiece_ignore_case = True
pre_trained_model_path = "google/rembert" pre_trained_model_path = "google/rembert"
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
tokenizer = RemBertTokenizer(SAMPLE_VOCAB) tokenizer = RemBertTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
# Copied from ReformerTokenizationTest.get_input_output_texts # Copied from ReformerTokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
@ -222,7 +223,7 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"): with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
if self.rust_tokenizer_class is not None: if self.rust_tokenizer_class is not None:
tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos) tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos)
self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos) self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values())) self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
# We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright

View File

@ -18,12 +18,13 @@ import itertools
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast
from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, slow from transformers.testing_utils import require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers @require_tokenizers
@ -34,8 +35,9 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
from_pretrained_kwargs = {"cls_token": "<s>"} from_pretrained_kwargs = {"cls_token": "<s>"}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [ vocab = [
@ -62,22 +64,30 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
] ]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"} cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp: with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "lower newer" input_text = "lower newer"
@ -171,8 +181,8 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self): def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence." sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
@ -202,7 +212,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_change_add_prefix_space_and_trim_offsets_args(self): def test_change_add_prefix_space_and_trim_offsets_args(self):
for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2): for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
) )
@ -222,7 +232,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name`
text = f"{text_of_1_token} {text_of_1_token}" text = f"{text_of_1_token} {text_of_1_token}"
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -232,7 +242,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
) )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -242,7 +252,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
) )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -252,7 +262,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)), (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
) )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -274,7 +284,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), # (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
# ) # )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -284,7 +294,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
) )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -294,7 +304,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
) )
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
) )
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
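Most of the Roberta hunk above is mechanical: each tokenizer_class.from_pretrained / rust_tokenizer_class.from_pretrained call inside the tests is routed through the cached getters instead, so loops over tokenizers_list and over the add_prefix_space/trim_offsets grid stop rebuilding identical tokenizers. A toy version of that effect (FakeTokenizer and the checkpoint name are made up):

from functools import lru_cache

CONSTRUCTIONS = 0

class FakeTokenizer:
    @classmethod
    def from_pretrained(cls, name, **kwargs):
        global CONSTRUCTIONS
        CONSTRUCTIONS += 1  # each call stands in for reading vocab/merges files from disk
        return cls()

class SuiteSketch:
    tmpdirname = "unused-default"

    @classmethod
    @lru_cache(maxsize=64)
    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
        pretrained_name = pretrained_name or cls.tmpdirname
        return FakeTokenizer.from_pretrained(pretrained_name, **kwargs)

for _ in range(5):  # five sub-tests over the same checkpoint and kwargs
    SuiteSketch.get_rust_tokenizer("some-checkpoint", add_prefix_space=True)

print(CONSTRUCTIONS)  # 1 -- the other four calls were served from the cache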

View File

@ -41,8 +41,9 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
space_between_special_tokens = True space_between_special_tokens = True
from_pretrained_filter = filter_non_english from_pretrained_filter = filter_non_english
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "", "", "", "", "a", "b", "c", "d"] vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "", "", "", "", "a", "b", "c", "d"]
word_shape = {} word_shape = {}
@ -50,14 +51,14 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for i, value in enumerate(vocab_tokens): for i, value in enumerate(vocab_tokens):
word_shape[value] = i word_shape[value] = i
word_pronunciation[value] = i word_pronunciation[value] = i
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.word_shape_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["word_shape_file"]) cls.word_shape_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["word_shape_file"])
self.word_pronunciation_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["word_pronunciation_file"]) cls.word_pronunciation_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["word_pronunciation_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
with open(self.word_shape_file, "w", encoding="utf-8") as word_shape_writer: with open(cls.word_shape_file, "w", encoding="utf-8") as word_shape_writer:
json.dump(word_shape, word_shape_writer, ensure_ascii=False) json.dump(word_shape, word_shape_writer, ensure_ascii=False)
with open(self.word_pronunciation_file, "w", encoding="utf-8") as word_pronunciation_writer: with open(cls.word_pronunciation_file, "w", encoding="utf-8") as word_pronunciation_writer:
json.dump(word_pronunciation, word_pronunciation_writer, ensure_ascii=False) json.dump(word_pronunciation, word_pronunciation_writer, ensure_ascii=False)
def test_full_tokenizer(self): def test_full_tokenizer(self):
@ -204,7 +205,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self): def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus( tokens = tokenizer_r.encode_plus(
@ -260,8 +261,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
kwargs["tokenize_chinese_chars"] = True kwargs["tokenize_chinese_chars"] = True
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@ -274,8 +275,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char) self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)
kwargs["tokenize_chinese_chars"] = False kwargs["tokenize_chinese_chars"] = False
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)

View File

@ -15,11 +15,12 @@
import tempfile import tempfile
import unittest import unittest
from functools import lru_cache
from transformers import RoFormerTokenizer, RoFormerTokenizerFast from transformers import RoFormerTokenizer, RoFormerTokenizerFast
from transformers.testing_utils import require_rjieba, require_tokenizers from transformers.testing_utils import require_rjieba, require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_rjieba @require_rjieba
@ -31,14 +32,25 @@ class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
space_between_special_tokens = True space_between_special_tokens = True
test_rust_tokenizer = True test_rust_tokenizer = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
tokenizer = cls.tokenizer_class.from_pretrained("junnyu/roformer_chinese_base")
tokenizer.save_pretrained(cls.tmpdirname)
def get_tokenizer(self, **kwargs): @classmethod
return self.tokenizer_class.from_pretrained("junnyu/roformer_chinese_base", **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs): @classmethod
return self.rust_tokenizer_class.from_pretrained("junnyu/roformer_chinese_base", **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_chinese_input_output_texts(self): def get_chinese_input_output_texts(self):
input_text = "永和服装饰品有限公司,今天天气非常好" input_text = "永和服装饰品有限公司,今天天气非常好"

View File

@ -59,12 +59,13 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True test_sentencepiece = True
from_pretrained_kwargs = {} from_pretrained_kwargs = {}
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = SeamlessM4TTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = SeamlessM4TTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_full_tokenizer(self): def test_full_tokenizer(self):
tokenizer = SeamlessM4TTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = SeamlessM4TTokenizer(SAMPLE_VOCAB, keep_accents=True)
@ -353,7 +354,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)] added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs pretrained_name, additional_special_tokens=added_tokens, **kwargs
) )
r_output = tokenizer_r.encode("Hey this is a <special> token") r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -363,7 +364,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output) self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer: if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained( tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, pretrained_name,
additional_special_tokens=added_tokens, additional_special_tokens=added_tokens,
**kwargs, # , from_slow=True <- unfortunately too slow to convert **kwargs, # , from_slow=True <- unfortunately too slow to convert

View File

@ -17,12 +17,13 @@ import json
import os import os
import tempfile import tempfile
import unittest import unittest
from functools import lru_cache
from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, SiglipTokenizer from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, SiglipTokenizer
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
from transformers.utils import cached_property, is_tf_available, is_torch_available from transformers.utils import cached_property, is_tf_available, is_torch_available
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@ -44,13 +45,13 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True test_sentencepiece = True
test_sentencepiece_ignore_case = True test_sentencepiece_ignore_case = True
# Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.setUp with T5->Siglip @classmethod
def setUp(self): def setUpClass(cls):
super().setUp() super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = SiglipTokenizer(SAMPLE_VOCAB) tokenizer = SiglipTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
# Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_convert_token_and_id with T5->Siglip # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_convert_token_and_id with T5->Siglip
def test_convert_token_and_id(self): def test_convert_token_and_id(self):
@ -135,9 +136,12 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def siglip_tokenizer(self): def siglip_tokenizer(self):
return SiglipTokenizer.from_pretrained("google/siglip-base-patch16-224") return SiglipTokenizer.from_pretrained("google/siglip-base-patch16-224")
# Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.get_tokenizer with T5->Siglip @classmethod
def get_tokenizer(self, **kwargs) -> SiglipTokenizer: @use_cache_if_possible
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> SiglipTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_rust_and_python_full_tokenizers with T5->Siglip # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_rust_and_python_full_tokenizers with T5->Siglip
def test_rust_and_python_full_tokenizers(self): def test_rust_and_python_full_tokenizers(self):
@ -227,10 +231,10 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special>", lstrip=True)] added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs pretrained_name, additional_special_tokens=added_tokens, **kwargs
) )
tokenizer_cr = self.rust_tokenizer_class.from_pretrained( tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
) )
tokenizer_p = self.tokenizer_class.from_pretrained( tokenizer_p = self.tokenizer_class.from_pretrained(

View File

@ -42,8 +42,9 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False test_rust_tokenizer = False
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
spm_model = sp.SentencePieceProcessor() spm_model = sp.SentencePieceProcessor()
spm_model.Load(SAMPLE_VOCAB) spm_model.Load(SAMPLE_VOCAB)
@ -52,13 +53,13 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))] vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))]
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
save_dir = Path(self.tmpdirname) save_dir = Path(cls.tmpdirname)
save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"]) save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists(): if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
copyfile(SAMPLE_VOCAB, save_dir / VOCAB_FILES_NAMES["spm_file"]) copyfile(SAMPLE_VOCAB, save_dir / VOCAB_FILES_NAMES["spm_file"])
tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname) tokenizer = Speech2TextTokenizer.from_pretrained(cls.tmpdirname)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self): def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""

@ -35,8 +35,9 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False test_rust_tokenizer = False
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = SpeechT5Tokenizer(SAMPLE_VOCAB) tokenizer = SpeechT5Tokenizer(SAMPLE_VOCAB)
@ -46,7 +47,7 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.add_special_tokens({"mask_token": mask_token}) tokenizer.add_special_tokens({"mask_token": mask_token})
tokenizer.add_tokens(["<ctc_blank>"]) tokenizer.add_tokens(["<ctc_blank>"])
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "this is a test" input_text = "this is a test"

@ -13,8 +13,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import unittest import unittest
from functools import lru_cache
from tests.test_tokenization_common import TokenizerTesterMixin from tests.test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
from transformers import SplinterTokenizerFast, is_tf_available, is_torch_available from transformers import SplinterTokenizerFast, is_tf_available, is_torch_available
from transformers.models.splinter import SplinterTokenizer from transformers.models.splinter import SplinterTokenizer
from transformers.testing_utils import get_tests_dir, slow from transformers.testing_utils import get_tests_dir, slow
@ -40,20 +41,29 @@ class SplinterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
pre_trained_model_path = "tau/splinter-base" pre_trained_model_path = "tau/splinter-base"
# Copied from transformers.models.siglip.SiglipTokenizationTest.setUp # Copied from transformers.models.siglip.SiglipTokenizationTest.setUp
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
tokenizer = SplinterTokenizer(SAMPLE_VOCAB) tokenizer = SplinterTokenizer(SAMPLE_VOCAB)
tokenizer.vocab["[UNK]"] = len(tokenizer.vocab) tokenizer.vocab["[UNK]"] = len(tokenizer.vocab)
tokenizer.vocab["[QUESTION]"] = len(tokenizer.vocab) tokenizer.vocab["[QUESTION]"] = len(tokenizer.vocab)
tokenizer.vocab["."] = len(tokenizer.vocab) tokenizer.vocab["."] = len(tokenizer.vocab)
tokenizer.add_tokens("this is a test thou shall not determine rigor truly".split()) tokenizer.add_tokens("this is a test thou shall not determine rigor truly".split())
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def get_tokenizer(self, **kwargs) -> SplinterTokenizer: @classmethod
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> SplinterTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> SplinterTokenizerFast: @classmethod
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> SplinterTokenizerFast:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Copied from transformers.models.siglip.SiglipTokenizationTest.test_get_vocab # Copied from transformers.models.siglip.SiglipTokenizationTest.test_get_vocab
def test_get_vocab(self): def test_get_vocab(self):

@ -13,22 +13,31 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from functools import lru_cache
from transformers import SqueezeBertTokenizer, SqueezeBertTokenizerFast from transformers import SqueezeBertTokenizer, SqueezeBertTokenizerFast
from transformers.testing_utils import require_tokenizers, slow from transformers.testing_utils import require_tokenizers, slow
from ..bert.test_tokenization_bert import BertTokenizationTest from ...test_tokenization_common import use_cache_if_possible
# Avoid importing `BertTokenizationTest` directly as it will run as `test_tokenization_squeezebert.py::BertTokenizationTest`
# together with `test_tokenization_bert.py::BertTokenizationTest`.
from ..bert import test_tokenization_bert
@require_tokenizers @require_tokenizers
class SqueezeBertTokenizationTest(BertTokenizationTest): class SqueezeBertTokenizationTest(test_tokenization_bert.BertTokenizationTest):
tokenizer_class = SqueezeBertTokenizer tokenizer_class = SqueezeBertTokenizer
rust_tokenizer_class = SqueezeBertTokenizerFast rust_tokenizer_class = SqueezeBertTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
from_pretrained_id = "squeezebert/squeezebert-uncased" from_pretrained_id = "squeezebert/squeezebert-uncased"
def get_rust_tokenizer(self, **kwargs): @classmethod
return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return SqueezeBertTokenizerFast.from_pretrained(pretrained_name, **kwargs)
@slow @slow
def test_sequence_builders(self): def test_sequence_builders(self):

@ -17,12 +17,13 @@ import os
import re import re
import tempfile import tempfile
import unittest import unittest
from functools import lru_cache
from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_seqio, require_tokenizers, slow from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_seqio, require_tokenizers, slow
from transformers.utils import cached_property, is_tf_available, is_torch_available from transformers.utils import cached_property, is_tf_available, is_torch_available
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@ -44,12 +45,13 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = T5Tokenizer(SAMPLE_VOCAB) tokenizer = T5Tokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self): def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@ -145,11 +147,19 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def t5_base_tokenizer_fast(self): def t5_base_tokenizer_fast(self):
return T5TokenizerFast.from_pretrained("google-t5/t5-base") return T5TokenizerFast.from_pretrained("google-t5/t5-base")
def get_tokenizer(self, **kwargs) -> T5Tokenizer: @classmethod
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> T5Tokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> T5TokenizerFast: @classmethod
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> T5TokenizerFast:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def test_rust_and_python_full_tokenizers(self): def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer: if not self.test_rust_tokenizer:
@ -275,10 +285,10 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special>", lstrip=True)] added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs pretrained_name, additional_special_tokens=added_tokens, **kwargs
) )
tokenizer_cr = self.rust_tokenizer_class.from_pretrained( tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
) )
tokenizer_p = self.tokenizer_class.from_pretrained( tokenizer_p = self.tokenizer_class.from_pretrained(
@ -460,10 +470,8 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
EXPECTED_WITH_SPACE = [9459, 149, 33, 25, 692, 1] EXPECTED_WITH_SPACE = [9459, 149, 33, 25, 692, 1]
EXPECTED_WO_SPACE = [3845, 63, 149, 33, 25, 692, 1] EXPECTED_WO_SPACE = [3845, 63, 149, 33, 25, 692, 1]
slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False) slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
fast_ = self.rust_tokenizer_class.from_pretrained( fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=False, legacy=False, from_slow=True)
pretrained_name, add_prefix_space=False, legacy=False, from_slow=True
)
self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE) self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE)
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs)) self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
self.assertEqual(slow_.tokenize(inputs), ["He", "y", "▁how", "▁are", "▁you", "▁doing"]) self.assertEqual(slow_.tokenize(inputs), ["He", "y", "▁how", "▁are", "▁you", "▁doing"])
@ -473,8 +481,8 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True), fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True),
) )
slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False) slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False) fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE) self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE)
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs)) self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"]) self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"])

@ -112,8 +112,9 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return output_txt, output_ids return output_txt, output_ids
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab_tokens = [ vocab_tokens = [
"[UNK]", "[UNK]",
@ -132,8 +133,8 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low", "low",
"lowest", "lowest",
] ]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
@ -352,7 +353,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self): def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus( tokens = tokenizer_r.encode_plus(

@ -93,12 +93,13 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return questions, words, boxes return questions, words, boxes
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = UdopTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = UdopTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00e9d,running" input_text = "UNwant\u00e9d,running"
@ -456,8 +457,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=50): def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id
@ -922,8 +923,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Input tokens id # Input tokens id
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
@ -1109,7 +1110,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_mapping(self): def test_offsets_mapping(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
text = ["a", "wonderful", "test"] text = ["a", "wonderful", "test"]
boxes = [[1, 8, 12, 20] for _ in range(len(text))] boxes = [[1, 8, 12, 20] for _ in range(len(text))]
@ -1239,8 +1240,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
@ -1293,8 +1294,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes() words, boxes = self.get_words_and_boxes()
tokens_r = tokenizer_r.encode_plus_boxes( tokens_r = tokenizer_r.encode_plus_boxes(
words, words,
@ -1320,7 +1321,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_compare_add_special_tokens(self): def test_compare_add_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
@ -1402,7 +1403,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)] added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs pretrained_name, additional_special_tokens=added_tokens, **kwargs
) )
words = "Hey this is a <special> token".split() words = "Hey this is a <special> token".split()
@ -1416,7 +1417,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output) self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer: if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained( tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
) )
tokenizer_p = self.tokenizer_class.from_pretrained( tokenizer_p = self.tokenizer_class.from_pretrained(
@ -1591,8 +1592,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id pad_token_id = tokenizer_p.pad_token_id

@ -19,12 +19,13 @@ import os
import shutil import shutil
import tempfile import tempfile
import unittest import unittest
from functools import lru_cache
from transformers import VitsTokenizer from transformers import VitsTokenizer
from transformers.models.vits.tokenization_vits import VOCAB_FILES_NAMES from transformers.models.vits.tokenization_vits import VOCAB_FILES_NAMES
from transformers.testing_utils import slow from transformers.testing_utils import slow
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase): class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
@ -32,8 +33,9 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = VitsTokenizer tokenizer_class = VitsTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab = ( vocab = (
"k ' z y u d h e s w 3 c p - 1 j m i X f l o 0 b r a 4 2 n _ x v t q 5 6 g ț ţ < > | <pad> <unk>".split( "k ' z y u d h e s w 3 c p - 1 j m i X f l o 0 b r a 4 2 n _ x v t q 5 6 g ț ţ < > | <pad> <unk>".split(
@ -44,18 +46,22 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens[" "] = vocab_tokens["X"] vocab_tokens[" "] = vocab_tokens["X"]
del vocab_tokens["X"] del vocab_tokens["X"]
self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>"} cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>"}
self.tmpdirname = tempfile.mkdtemp() cls.tmpdirname = tempfile.mkdtemp()
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
kwargs["phonemize"] = False kwargs["phonemize"] = False
kwargs["normalize"] = False kwargs["normalize"] = False
return VitsTokenizer.from_pretrained(self.tmpdirname, **kwargs) pretrained_name = pretrained_name or cls.tmpdirname
return VitsTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5): def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5):
txt = "beyonce lives in los angeles" txt = "beyonce lives in los angeles"

@ -21,6 +21,7 @@ import random
import shutil import shutil
import tempfile import tempfile
import unittest import unittest
from functools import lru_cache
import numpy as np import numpy as np
@ -33,7 +34,7 @@ from transformers import (
from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizerOutput from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizerOutput
from transformers.testing_utils import require_torch, slow from transformers.testing_utils import require_torch, slow
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
global_rng = random.Random() global_rng = random.Random()
@ -57,22 +58,27 @@ def floats_list(shape, scale=1.0, rng=None, name=None):
class Wav2Vec2TokenizerTest(unittest.TestCase): class Wav2Vec2TokenizerTest(unittest.TestCase):
tokenizer_class = Wav2Vec2Tokenizer tokenizer_class = Wav2Vec2Tokenizer
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"} cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
self.tmpdirname = tempfile.mkdtemp() cls.tmpdirname = tempfile.mkdtemp()
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return Wav2Vec2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return Wav2Vec2Tokenizer.from_pretrained(pretrained_name, **kwargs)
def test_tokenizer_decode(self): def test_tokenizer_decode(self):
# TODO(PVP) - change to facebook # TODO(PVP) - change to facebook
@ -237,7 +243,7 @@ class Wav2Vec2TokenizerTest(unittest.TestCase):
def test_save_pretrained(self): def test_save_pretrained(self):
pretrained_name = list(self.tokenizer_class.pretrained_vocab_files_map["vocab_file"].keys())[0] pretrained_name = list(self.tokenizer_class.pretrained_vocab_files_map["vocab_file"].keys())[0]
tokenizer = self.tokenizer_class.from_pretrained(pretrained_name) tokenizer = self.get_tokenizer(pretrained_name)
tmpdirname2 = tempfile.mkdtemp() tmpdirname2 = tempfile.mkdtemp()
tokenizer_files = tokenizer.save_pretrained(tmpdirname2) tokenizer_files = tokenizer.save_pretrained(tmpdirname2)
@ -373,22 +379,27 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = Wav2Vec2CTCTokenizer tokenizer_class = Wav2Vec2CTCTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"} cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
self.tmpdirname = tempfile.mkdtemp() cls.tmpdirname = tempfile.mkdtemp()
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return Wav2Vec2CTCTokenizer.from_pretrained(pretrained_name, **kwargs)
def test_tokenizer_add_token_chars(self): def test_tokenizer_add_token_chars(self):
tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h")

@ -17,6 +17,7 @@
import json import json
import os import os
import unittest import unittest
from functools import lru_cache
from typing import Tuple from typing import Tuple
from transformers import Wav2Vec2PhonemeCTCTokenizer from transformers import Wav2Vec2PhonemeCTCTokenizer
@ -24,7 +25,7 @@ from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
from transformers.models.wav2vec2_phoneme.tokenization_wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizerOutput from transformers.models.wav2vec2_phoneme.tokenization_wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizerOutput
from transformers.testing_utils import require_phonemizer from transformers.testing_utils import require_phonemizer
from ...test_tokenization_common import TokenizerTesterMixin from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_phonemizer @require_phonemizer
@ -33,8 +34,9 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = Wav2Vec2PhonemeCTCTokenizer tokenizer_class = Wav2Vec2PhonemeCTCTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
vocab = ( vocab = (
"<s> <pad> </s> <unk> n s t ə l a i k d m ɛ ɾ e ɪ p o ɐ z ð f j v b ɹ ʁ ʊ iː r w ʌ u ɡ æ aɪ ʃ h ɔ ɑː " "<s> <pad> </s> <unk> n s t ə l a i k d m ɛ ɾ e ɪ p o ɐ z ð f j v b ɹ ʁ ʊ iː r w ʌ u ɡ æ aɪ ʃ h ɔ ɑː "
@ -53,10 +55,10 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
).split(" ") ).split(" ")
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"} cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp: with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n") fp.write(json.dumps(vocab_tokens) + "\n")
# overwrite since phonemes require specific creation # overwrite since phonemes require specific creation
@ -84,9 +86,13 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
output_ids = tokenizer.encode(output_txt, add_special_tokens=False) output_ids = tokenizer.encode(output_txt, add_special_tokens=False)
return output_txt, output_ids return output_txt, output_ids
def get_tokenizer(self, **kwargs): @classmethod
kwargs.update(self.special_tokens_map) @use_cache_if_possible
return Wav2Vec2PhonemeCTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) @lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return Wav2Vec2PhonemeCTCTokenizer.from_pretrained(pretrained_name, **kwargs)
def test_tokenizer_add_new_tokens(self): def test_tokenizer_add_new_tokens(self):
tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft") tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")

@ -40,12 +40,13 @@ class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = False test_sentencepiece = False
test_seq2seq = False test_seq2seq = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny") tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
tokenizer.pad_token_id = 50256 tokenizer.pad_token_id = 50256
tokenizer.pad_token = "<|endoftext|>" tokenizer.pad_token = "<|endoftext|>"
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self): def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""

@ -37,12 +37,13 @@ class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = XGLMTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = XGLMTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self): def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""

@ -29,8 +29,9 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMTokenizer tokenizer_class = XLMTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [ vocab = [
@ -59,11 +60,11 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab)))) vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""] merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w") as fp: with open(cls.vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens)) fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp: with open(cls.merges_file, "w") as fp:
fp.write("\n".join(merges)) fp.write("\n".join(merges))
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):

@ -37,12 +37,13 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self): def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@ -148,8 +149,8 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-xlm-roberta", {}) self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-xlm-roberta", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp() tmpdirname2 = tempfile.mkdtemp()

@ -33,12 +33,13 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True test_sentencepiece = True
def setUp(self): @classmethod
super().setUp() def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing # We have a SentencePiece fixture for testing
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self): def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""

@ -13,6 +13,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import copy
import functools
import inspect import inspect
import itertools import itertools
import json import json
@ -24,6 +26,7 @@ import tempfile
import traceback import traceback
import unittest import unittest
from collections import OrderedDict from collections import OrderedDict
from functools import lru_cache
from itertools import takewhile from itertools import takewhile
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
@ -69,6 +72,38 @@ if TYPE_CHECKING:
from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel
def use_cache_if_possible(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
use_cache = kwargs.pop("use_cache", True)
underline_func = func
if "functools" in str(func):
underline_func = func.__wrapped__
if not use_cache:
return underline_func(*args, **kwargs)
if any(not arg.__hash__ for arg in args):
return underline_func(*args, **kwargs)
elif any(not kwarg.__hash__ for kwarg in kwargs.values()):
return underline_func(*args, **kwargs)
cached = func(*args, **kwargs)
copied = copy.deepcopy(cached)
if hasattr(copied, "_tokenizer") and "tests.models.clip.test_tokenization_clip.CLIPTokenizationTest" in str(
args[0]
):
copied._tokenizer = cached._tokenizer
if hasattr(copied, "sp_model"):
copied.sp_model = cached.sp_model
return copied
return wrapper
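The `use_cache_if_possible` helper above is what makes it safe to wrap the `get_tokenizer` / `get_rust_tokenizer` helpers in `functools.lru_cache`: it bypasses the cache when `use_cache=False` is passed or when any argument is unhashable, and otherwise returns a deep copy of the cached tokenizer so one test cannot mutate state seen by another, while keeping the cached `sp_model` (and, for the CLIP test, `_tokenizer`) object on the returned copy. A minimal usage sketch follows; `FakeTokenizer` and `DummyTokenizationTest` are hypothetical stand-ins, not classes from this commit.

from functools import lru_cache

# Requires a transformers checkout on sys.path; this is the decorator defined above.
from tests.test_tokenization_common import use_cache_if_possible


class FakeTokenizer:
    """Hypothetical stand-in for an expensive-to-build tokenizer."""

    def __init__(self, name):
        self.name = name


class DummyTokenizationTest:
    @classmethod
    @use_cache_if_possible
    @lru_cache(maxsize=64)
    def get_tokenizer(cls, pretrained_name=None, **kwargs):
        return FakeTokenizer(pretrained_name or "local-fixture")


first = DummyTokenizationTest.get_tokenizer("some-checkpoint")
second = DummyTokenizationTest.get_tokenizer("some-checkpoint")
assert first is not second        # each call hands back a deep copy ...
assert first.name == second.name  # ... of the same cached instance
fresh = DummyTokenizationTest.get_tokenizer("some-checkpoint", use_cache=False)  # skips the cache entirely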
logger = logging.get_logger(__name__) logger = logging.get_logger(__name__)
NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"] NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
@ -198,32 +233,34 @@ class TokenizerTesterMixin:
# test_sentencepiece must also be set to True # test_sentencepiece must also be set to True
test_sentencepiece_ignore_case = False test_sentencepiece_ignore_case = False
def setUp(self) -> None: @classmethod
def setUpClass(cls) -> None:
# Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name) # information available in Tokenizer (name, rust class, python class, vocab key name)
self.from_pretrained_id = ( cls.from_pretrained_id = (
[self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id [cls.from_pretrained_id] if isinstance(cls.from_pretrained_id, str) else cls.from_pretrained_id
) )
self.tokenizers_list = [] cls.tokenizers_list = []
if self.test_rust_tokenizer: if cls.test_rust_tokenizer:
self.tokenizers_list = [ cls.tokenizers_list = [
( (
self.rust_tokenizer_class, cls.rust_tokenizer_class,
pretrained_id, pretrained_id,
self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}, cls.from_pretrained_kwargs if cls.from_pretrained_kwargs is not None else {},
) )
for pretrained_id in self.from_pretrained_id for pretrained_id in cls.from_pretrained_id
] ]
else: else:
self.tokenizers_list = [] cls.tokenizers_list = []
with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip() cls._data = f_data.read().replace("\n\n", "\n").strip()
self.tmpdirname = tempfile.mkdtemp() cls.tmpdirname = tempfile.mkdtemp()
def tearDown(self): @classmethod
shutil.rmtree(self.tmpdirname) def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname)
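For orientation, a stripped-down sketch (plain unittest, nothing transformers-specific) of the class-level fixture lifecycle the mixin now relies on: the temporary directory, and the tokenizer files each model test saves into it, are created once per test class and removed once after the last test, instead of once per test method.

import shutil
import tempfile
import unittest


class ExampleFixtureLifecycle(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.tmpdirname = tempfile.mkdtemp()  # shared by every test in this class

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname)  # cleaned up once, after the last test has run

    def test_fixture_is_available(self):
        self.assertTrue(self.tmpdirname)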
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_txt = self.get_clean_sequence(tokenizer)[0] input_txt = self.get_clean_sequence(tokenizer)[0]
@ -267,11 +304,19 @@ class TokenizerTesterMixin:
else: else:
raise ValueError("This tokenizer class has no tokenizer to be tested.") raise ValueError("This tokenizer class has no tokenizer to be tested.")
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: @classmethod
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: @classmethod
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) @use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizerFast:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
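A hedged sketch of how a subclass test consumes these helpers after the change (`test_cached_helpers` is illustrative only, not a test added by this commit): repeated calls with the same `(pretrained_name, kwargs)` are served from the `lru_cache` as deep copies, and `use_cache=False` remains available for tests that mutate tokenizer state in place.

# Written as it would appear inside a TokenizerTesterMixin subclass.
def test_cached_helpers(self):
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            # The first call per (pretrained_name, kwargs) runs from_pretrained;
            # later calls reuse the cached instance and return a deep copy of it.
            tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
            tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
            self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)

            # Opt out of the cache when the test is about to mutate the tokenizer.
            fresh_r = self.get_rust_tokenizer(pretrained_name, use_cache=False, **kwargs)
            self.assertEqual(len(fresh_r), len(tokenizer_r))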
def tokenizer_integration_test_util( def tokenizer_integration_test_util(
self, self,
@ -1263,7 +1308,7 @@ class TokenizerTesterMixin:
if not self.test_rust_tokenizer: if not self.test_rust_tokenizer:
self.skipTest(reason="No fast tokenizer defined") self.skipTest(reason="No fast tokenizer defined")
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name) tokenizer_r = self.get_rust_tokenizer(pretrained_name)
self._check_no_pad_token_padding(tokenizer_r, conversations) self._check_no_pad_token_padding(tokenizer_r, conversations)
tokenizer_r.padding_side = "right" tokenizer_r.padding_side = "right"
@ -1446,7 +1491,7 @@ class TokenizerTesterMixin:
if not self.test_rust_tokenizer: if not self.test_rust_tokenizer:
self.skipTest(reason="No fast tokenizer defined") self.skipTest(reason="No fast tokenizer defined")
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name) tokenizer_r = self.get_rust_tokenizer(pretrained_name)
# Find where to truncate, as the amount of tokens is different for different tokenizers and I want the # Find where to truncate, as the amount of tokens is different for different tokenizers and I want the
# truncation to happen in the middle of the assistant content. # truncation to happen in the middle of the assistant content.
@ -2050,11 +2095,9 @@ class TokenizerTesterMixin:
if self.rust_tokenizer_class is not None: if self.rust_tokenizer_class is not None:
pretrained_name = self.from_pretrained_id pretrained_name = self.from_pretrained_id
slow_tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, legacy=False) slow_tokenizer = self.get_tokenizer(pretrained_name, legacy=False)
with self.subTest(f"{pretrained_name}"): with self.subTest(f"{pretrained_name}"):
rust_tokenizer = self.rust_tokenizer_class.from_pretrained( rust_tokenizer = self.get_rust_tokenizer(pretrained_name, from_slow=True, legacy=False)
pretrained_name, from_slow=True, legacy=False
)
input_full_vocab_ids = list( input_full_vocab_ids = list(
range(len(slow_tokenizer)) range(len(slow_tokenizer))
) # TODO let's maybe shuffle this! And run it 4 times. This way we cover more combinations ) # TODO let's maybe shuffle this! And run it 4 times. This way we cover more combinations
@ -2200,14 +2243,10 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
if self.test_rust_tokenizer: if self.test_rust_tokenizer:
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(pretrained_name, padding_side="left", **kwargs)
pretrained_name, padding_side="left", **kwargs
)
self.assertEqual(tokenizer_r.padding_side, "left") self.assertEqual(tokenizer_r.padding_side, "left")
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(pretrained_name, padding_side="right", **kwargs)
pretrained_name, padding_side="right", **kwargs
)
self.assertEqual(tokenizer_r.padding_side, "right") self.assertEqual(tokenizer_r.padding_side, "right")
self.assertRaises( self.assertRaises(
@ -2219,10 +2258,10 @@ class TokenizerTesterMixin:
) )
if self.test_slow_tokenizer: if self.test_slow_tokenizer:
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="left", **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, padding_side="left", **kwargs)
self.assertEqual(tokenizer_p.padding_side, "left") self.assertEqual(tokenizer_p.padding_side, "left")
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="right", **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, padding_side="right", **kwargs)
self.assertEqual(tokenizer_p.padding_side, "right") self.assertEqual(tokenizer_p.padding_side, "right")
self.assertRaises( self.assertRaises(
@ -2237,14 +2276,10 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
if self.test_rust_tokenizer: if self.test_rust_tokenizer:
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(pretrained_name, truncation_side="left", **kwargs)
pretrained_name, truncation_side="left", **kwargs
)
self.assertEqual(tokenizer_r.truncation_side, "left") self.assertEqual(tokenizer_r.truncation_side, "left")
tokenizer_r = self.rust_tokenizer_class.from_pretrained( tokenizer_r = self.get_rust_tokenizer(pretrained_name, truncation_side="right", **kwargs)
pretrained_name, truncation_side="right", **kwargs
)
self.assertEqual(tokenizer_r.truncation_side, "right") self.assertEqual(tokenizer_r.truncation_side, "right")
self.assertRaises( self.assertRaises(
@ -2256,14 +2291,10 @@ class TokenizerTesterMixin:
) )
if self.test_slow_tokenizer: if self.test_slow_tokenizer:
tokenizer_p = self.tokenizer_class.from_pretrained( tokenizer_p = self.get_tokenizer(pretrained_name, truncation_side="left", **kwargs)
pretrained_name, truncation_side="left", **kwargs
)
self.assertEqual(tokenizer_p.truncation_side, "left") self.assertEqual(tokenizer_p.truncation_side, "left")
tokenizer_p = self.tokenizer_class.from_pretrained( tokenizer_p = self.get_tokenizer(pretrained_name, truncation_side="right", **kwargs)
pretrained_name, truncation_side="right", **kwargs
)
self.assertEqual(tokenizer_p.truncation_side, "right") self.assertEqual(tokenizer_p.truncation_side, "right")
self.assertRaises( self.assertRaises(
@ -3194,18 +3225,18 @@ class TokenizerTesterMixin:
def test_is_fast(self): def test_is_fast(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Check is_fast is set correctly # Check is_fast is set correctly
self.assertTrue(tokenizer_r.is_fast) self.assertTrue(tokenizer_r.is_fast)
if self.test_slow_tokenizer: if self.test_slow_tokenizer:
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertFalse(tokenizer_p.is_fast) self.assertFalse(tokenizer_p.is_fast)
def test_fast_only_inputs(self): def test_fast_only_inputs(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Ensure None raise an error # Ensure None raise an error
self.assertRaises(TypeError, tokenizer_r.tokenize, None) self.assertRaises(TypeError, tokenizer_r.tokenize, None)
@ -3216,7 +3247,7 @@ class TokenizerTesterMixin:
def test_alignement_methods(self): def test_alignement_methods(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
text = " ".join(words) text = " ".join(words)
@ -3446,8 +3477,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Ensure basic input match # Ensure basic input match
                 input_p = tokenizer_p.encode_plus(self._data)
@@ -3487,8 +3518,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 # Check we have the same number of added_tokens for both pair and non-pair inputs.
                 self.assertEqual(
@@ -3505,8 +3536,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 # Check we have the correct max_length for both pair and non-pair inputs.
                 self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
@@ -3520,8 +3551,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 # sometimes the tokenizer saved online is not the same
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 # Assert the set of special tokens match.
                 self.assertSequenceEqual(
@@ -3532,7 +3563,7 @@ class TokenizerTesterMixin:
     def test_add_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 vocab_size = len(tokenizer_r)
                 self.assertEqual(tokenizer_r.add_tokens(""), 0)
@@ -3558,7 +3589,7 @@ class TokenizerTesterMixin:
     def test_offsets_mapping(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 text = "Wonderful no inspiration example with subtoken"
                 pair = "Along with an awesome pair"
@@ -3601,7 +3632,7 @@ class TokenizerTesterMixin:
         This needs to be padded so that it can represented as a tensor
         """
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+            tokenizer = self.get_rust_tokenizer(pretrained_name, **kwargs)
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"):

                 if is_torch_available():
@@ -3663,8 +3694,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 if hasattr(tokenizer_p, "add_prefix_space") and not tokenizer_p.add_prefix_space:
                     continue  # Too hard to test for now
@@ -3745,8 +3776,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 input_simple = [1, 2, 3]
                 input_pair = [1, 2, 3]
@@ -3767,8 +3798,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 # # Input string
                 # input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False)
                 # input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False)
@@ -3812,8 +3843,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id
@@ -4038,8 +4069,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id
@@ -4076,8 +4107,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 tmpdirname2 = tempfile.mkdtemp()
@@ -4151,8 +4182,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 sentence = "A, <mask> AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(
                     sentence,
@@ -4176,7 +4207,7 @@ class TokenizerTesterMixin:
     def test_compare_add_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
                 # pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True)
@@ -4219,8 +4250,8 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 string_sequence = "Asserting that both tokenizers are equal"
                 python_output = tokenizer_p.prepare_for_model(
                     tokenizer_p.encode(string_sequence, add_special_tokens=False)
@@ -4235,7 +4266,7 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 added_tokens = [AddedToken("<special>", lstrip=True)]
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=added_tokens, **kwargs
                 )
                 r_output = tokenizer_r.encode("Hey this is a <special> token")
@@ -4246,12 +4277,10 @@ class TokenizerTesterMixin:
                 if self.test_slow_tokenizer:
                     # in rust fast, you lose the information of the AddedToken when initializing with `additional_special_tokens`
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_cr = self.get_rust_tokenizer(
                         pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
                     )
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                    )
+                    tokenizer_p = self.get_tokenizer(pretrained_name, additional_special_tokens=added_tokens, **kwargs)

                     p_output = tokenizer_p.encode("Hey this is a <special> token")
@@ -4498,7 +4527,7 @@ class TokenizerTesterMixin:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 with tempfile.TemporaryDirectory() as tmp_dir:
                     # Save the fast tokenizer files in a temporary directory
-                    tokenizer_old = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs, use_fast=True)
+                    tokenizer_old = self.get_rust_tokenizer(pretrained_name, **kwargs, use_fast=True)
                     tokenizer_old.save_pretrained(tmp_dir, legacy_format=False)  # save only fast version

                     # Initialize toy model for the trainer
@@ -4532,13 +4561,11 @@ class TokenizerTesterMixin:
                 with tempfile.TemporaryDirectory() as tmp_dir_1:
                     # Here we check that even if we have initialized a fast tokenizer with a tokenizer_file we can
                     # still save only the slow version and use these saved files to rebuild a tokenizer
-                    tokenizer_fast_old_1 = self.rust_tokenizer_class.from_pretrained(
-                        pretrained_name, **kwargs, use_fast=True
-                    )
+                    tokenizer_fast_old_1 = self.get_rust_tokenizer(pretrained_name, **kwargs, use_fast=True)
                     tokenizer_file = os.path.join(tmp_dir_1, "tokenizer.json")
                     tokenizer_fast_old_1.backend_tokenizer.save(tokenizer_file)

-                    tokenizer_fast_old_2 = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_fast_old_2 = self.get_rust_tokenizer(
                         pretrained_name, **kwargs, use_fast=True, tokenizer_file=tokenizer_file
                     )
@@ -4560,10 +4587,10 @@ class TokenizerTesterMixin:
             special_token = "<my_new_token>"
             special_sentence = f"Hey this is a {special_token} token"
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_rust = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_rust = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
                 )
-                tokenizer_py = self.tokenizer_class.from_pretrained(
+                tokenizer_py = self.get_tokenizer(
                     pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
                 )
@@ -4622,7 +4649,7 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 # Load a slow tokenizer from the hub, init with the new token for fast to also include it
-                tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
+                tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos)
                 EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
                 with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
                     self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
@@ -4662,7 +4689,7 @@ class TokenizerTesterMixin:
                 with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
                     if self.rust_tokenizer_class is not None:
-                        tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
+                        tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos)
                        self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
                         self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                         # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
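Note: every hunk in the file above makes the same mechanical substitution. Instead of calling self.tokenizer_class.from_pretrained(...) or self.rust_tokenizer_class.from_pretrained(...) inside each test, the tests go through the mixin's get_tokenizer / get_rust_tokenizer helpers, so a tokenizer that has already been loaded with the same arguments can be reused rather than rebuilt from disk for every test method. The snippet below is only an illustrative sketch of how such helpers could be backed by functools.lru_cache; the helper layout, the _cached_from_pretrained function, and the fallback for unhashable kwargs are assumptions for illustration, not the repository's actual implementation.

import functools


@functools.lru_cache(maxsize=None)
def _cached_from_pretrained(tokenizer_class, pretrained_name, kwargs_items):
    # kwargs_items is a hashable tuple of (key, value) pairs so lru_cache can key on it
    return tokenizer_class.from_pretrained(pretrained_name, **dict(kwargs_items))


class CachedTokenizerHelpersMixin:
    tokenizer_class = None
    rust_tokenizer_class = None
    tmpdirname = None

    @classmethod
    def _get_cached(cls, tokenizer_class, pretrained_name, **kwargs):
        pretrained_name = pretrained_name or cls.tmpdirname
        try:
            return _cached_from_pretrained(tokenizer_class, pretrained_name, tuple(sorted(kwargs.items())))
        except TypeError:
            # Unhashable kwargs (e.g. additional_special_tokens=[AddedToken(...)]) cannot be
            # used as a cache key, so fall back to a plain, uncached load.
            return tokenizer_class.from_pretrained(pretrained_name, **kwargs)

    @classmethod
    def get_tokenizer(cls, pretrained_name=None, **kwargs):
        return cls._get_cached(cls.tokenizer_class, pretrained_name, **kwargs)

    @classmethod
    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
        return cls._get_cached(cls.rust_tokenizer_class, pretrained_name, **kwargs)

One caveat with any cache like this: tests that mutate a tokenizer in place (adding tokens, changing padding attributes) would see each other's changes, so a real implementation has to either copy the cached instance or bypass the cache for such tests.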


@@ -33,19 +33,20 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     from_pretrained_vocab_key = "tokenizer_file"

-    def setUp(self):
-        self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
-        super().setUp()
-        self.test_rust_tokenizer = True
+    @classmethod
+    def setUpClass(cls):
+        cls.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
+        super().setUpClass()
+        cls.test_rust_tokenizer = True

         model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]
-        self.bytelevel_bpe_model_name = "SaulLu/dummy-tokenizer-bytelevel-bpe"
+        cls.bytelevel_bpe_model_name = "SaulLu/dummy-tokenizer-bytelevel-bpe"

         # Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
-        self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]
+        cls.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]

         tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     @unittest.skip(
         "We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any model"