diff --git a/tests/models/albert/test_tokenization_albert.py b/tests/models/albert/test_tokenization_albert.py index beb910b9d15..73022d5b02d 100644 --- a/tests/models/albert/test_tokenization_albert.py +++ b/tests/models/albert/test_tokenization_albert.py @@ -34,12 +34,13 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece = True test_sentencepiece_ignore_case = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = AlbertTokenizer(SAMPLE_VOCAB) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def get_input_output_texts(self, tokenizer): input_text = "this is a test" diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py index 274312983f1..f6b66982cc8 100644 --- a/tests/models/bart/test_tokenization_bart.py +++ b/tests/models/bart/test_tokenization_bart.py @@ -14,13 +14,14 @@ import json import os import unittest +from functools import lru_cache from transformers import BartTokenizer, BartTokenizerFast, BatchEncoding from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES from transformers.testing_utils import require_tokenizers, require_torch from transformers.utils import cached_property -from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors +from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors, use_cache_if_possible @require_tokenizers @@ -32,8 +33,10 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase): from_pretrained_filter = filter_roberta_detectors # from_pretrained_kwargs = {'add_prefix_space': True} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() + vocab = [ "l", "o", @@ -58,22 +61,30 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase): ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, 
pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): return "lower newer", "lower newer" @@ -154,8 +165,8 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) sentence = "A, AllenNLP sentence." tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) diff --git a/tests/models/barthez/test_tokenization_barthez.py b/tests/models/barthez/test_tokenization_barthez.py index c76435958c6..86663ce60c1 100644 --- a/tests/models/barthez/test_tokenization_barthez.py +++ b/tests/models/barthez/test_tokenization_barthez.py @@ -31,13 +31,14 @@ class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = BarthezTokenizerFast.from_pretrained("moussaKam/mbarthez") - tokenizer.save_pretrained(self.tmpdirname) - tokenizer.save_pretrained(self.tmpdirname, legacy_format=False) - self.tokenizer = tokenizer + tokenizer.save_pretrained(cls.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname, legacy_format=False) + cls.tokenizer = tokenizer def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" diff --git a/tests/models/bartpho/test_tokenization_bartpho.py b/tests/models/bartpho/test_tokenization_bartpho.py index 023584e91f8..6eb05a17acc 100644 --- a/tests/models/bartpho/test_tokenization_bartpho.py +++ b/tests/models/bartpho/test_tokenization_bartpho.py @@ -15,11 +15,12 @@ import os import unittest +from functools import lru_cache from transformers.models.bartpho.tokenization_bartpho import VOCAB_FILES_NAMES, BartphoTokenizer from transformers.testing_utils import get_tests_dir -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model") @@ -31,24 +32,29 @@ class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = False test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab = ["▁This", "▁is", "▁a", "▁t", "est"] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.monolingual_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["monolingual_vocab_file"]) - with open(self.monolingual_vocab_file, "w", encoding="utf-8") as fp: + cls.monolingual_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["monolingual_vocab_file"]) + with open(cls.monolingual_vocab_file, "w", encoding="utf-8") as fp: for token in vocab_tokens: 
fp.write(f"{token} {vocab_tokens[token]}\n") - tokenizer = BartphoTokenizer(SAMPLE_VOCAB, self.monolingual_vocab_file, **self.special_tokens_map) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer = BartphoTokenizer(SAMPLE_VOCAB, cls.monolingual_vocab_file, **cls.special_tokens_map) + tokenizer.save_pretrained(cls.tmpdirname) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return BartphoTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return BartphoTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "This is a là test" diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py index 747b0cf2a73..c4392b306b4 100644 --- a/tests/models/bert/test_tokenization_bert.py +++ b/tests/models/bert/test_tokenization_bert.py @@ -41,8 +41,9 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): space_between_special_tokens = True from_pretrained_filter = filter_non_english - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "[UNK]", @@ -61,8 +62,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "low", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): @@ -257,7 +258,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_offsets_with_special_characters(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
tokens = tokenizer_r.encode_plus( @@ -312,8 +313,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): kwargs["tokenize_chinese_chars"] = True - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) @@ -326,8 +327,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char) kwargs["tokenize_chinese_chars"] = False - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) diff --git a/tests/models/bert_generation/test_tokenization_bert_generation.py b/tests/models/bert_generation/test_tokenization_bert_generation.py index e1ccfba8f4e..1569932d715 100644 --- a/tests/models/bert_generation/test_tokenization_bert_generation.py +++ b/tests/models/bert_generation/test_tokenization_bert_generation.py @@ -34,11 +34,12 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = False test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py index d4954c96522..73020e70527 100644 --- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py +++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py @@ -17,6 +17,7 @@ import os import pickle import unittest +from functools import lru_cache from transformers import AutoTokenizer from transformers.models.bert.tokenization_bert import BertTokenizer @@ -31,7 +32,7 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import ( ) from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi_projection -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @custom_tokenizers @@ -41,8 +42,9 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = False space_between_special_tokens = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "[UNK]", @@ -72,8 +74,8 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, 
unittest.TestCase): "です", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): @@ -408,17 +410,21 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC tokenizer_class = BertJapaneseTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - def get_tokenizer(self, **kwargs): - return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + return BertJapaneseTokenizer.from_pretrained(cls.tmpdirname, subword_tokenizer_type="character", **kwargs) def get_input_output_texts(self, tokenizer): input_text = "こんにちは、世界。 \nこんばんは、世界。" diff --git a/tests/models/bertweet/test_tokenization_bertweet.py b/tests/models/bertweet/test_tokenization_bertweet.py index 71e0a0afe5b..d0659bc95af 100644 --- a/tests/models/bertweet/test_tokenization_bertweet.py +++ b/tests/models/bertweet/test_tokenization_bertweet.py @@ -15,10 +15,11 @@ import os import unittest +from functools import lru_cache from transformers.models.bertweet.tokenization_bertweet import VOCAB_FILES_NAMES, BertweetTokenizer -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -26,26 +27,31 @@ class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BertweetTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = ["I", "m", "V@@", "R@@", "r", "e@@"] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "a m"] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: for token in vocab_tokens: fp.write(f"{token} {vocab_tokens[token]}\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return BertweetTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return BertweetTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "I am VinAI Research" diff --git a/tests/models/big_bird/test_tokenization_big_bird.py b/tests/models/big_bird/test_tokenization_big_bird.py index 25f8de17700..f8fa29ba484 100644 --- a/tests/models/big_bird/test_tokenization_big_bird.py +++ b/tests/models/big_bird/test_tokenization_big_bird.py @@ -36,11 +36,12 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() - tokenizer = self.tokenizer_class(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer = cls.tokenizer_class(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(cls.tmpdirname) def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" diff --git a/tests/models/biogpt/test_tokenization_biogpt.py b/tests/models/biogpt/test_tokenization_biogpt.py index ea52a7cf7f3..4a9c53a6d05 100644 --- a/tests/models/biogpt/test_tokenization_biogpt.py +++ b/tests/models/biogpt/test_tokenization_biogpt.py @@ -30,8 +30,9 @@ class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BioGptTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -60,11 +61,11 @@ class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["l o 123", "lo w 1456", "e r 1789", ""] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: + with open(cls.merges_file, "w") as fp: fp.write("\n".join(merges)) def get_input_output_texts(self, tokenizer): diff --git a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py index 7ee3e989fb1..286052558b9 100644 --- a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py @@ -18,13 +18,14 @@ import json import os import unittest +from functools import lru_cache from transformers.models.blenderbot_small.tokenization_blenderbot_small import ( VOCAB_FILES_NAMES, BlenderbotSmallTokenizer, ) -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase): @@ -32,25 +33,30 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BlenderbotSmallTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "a p", "t e", "ap t", "a d", "ad apt", "a c", "ac t", ""] - self.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"} + cls.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return BlenderbotSmallTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return BlenderbotSmallTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "adapt act apte" diff --git a/tests/models/bloom/test_tokenization_bloom.py 
b/tests/models/bloom/test_tokenization_bloom.py index 6c2fffef64f..e8e255f49c1 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -13,14 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import unittest +from functools import lru_cache from datasets import load_dataset from transformers import BloomTokenizerFast from transformers.testing_utils import require_jinja, require_tokenizers -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -34,14 +36,21 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_vocab_key = "tokenizer_file" special_tokens_map = {"bos_token": "", "eos_token": "", "unk_token": "", "pad_token": ""} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = BloomTokenizerFast.from_pretrained("bigscience/tokenizer") - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + _kwargs = copy.deepcopy(cls.special_tokens_map) + _kwargs.update(kwargs) + kwargs = _kwargs + pretrained_name = pretrained_name or cls.tmpdirname + return BloomTokenizerFast.from_pretrained(pretrained_name, **kwargs) @unittest.skip(reason="This needs a slow tokenizer. Bloom does not have one!") def test_encode_decode_with_spaces(self): @@ -65,7 +74,7 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=6): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) # tokenizer_r.pad_token = None # Hotfixing padding = None # Simple input s = "This is a simple input" diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py index c3075beb506..5024ff3abe5 100644 --- a/tests/models/byt5/test_tokenization_byt5.py +++ b/tests/models/byt5/test_tokenization_byt5.py @@ -19,12 +19,13 @@ import re import shutil import tempfile import unittest +from functools import lru_cache from typing import Tuple from transformers import AddedToken, BatchEncoding, ByT5Tokenizer from transformers.utils import cached_property, is_tf_available, is_torch_available -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible if is_torch_available(): @@ -39,17 +40,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = ByT5Tokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = ByT5Tokenizer() - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) @cached_property def t5_base_tokenizer(self): return ByT5Tokenizer.from_pretrained("google/byt5-small") - def get_tokenizer(self, **kwargs) -> ByT5Tokenizer: - return 
self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> ByT5Tokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]: # XXX The default common tokenizer tests assume that every ID is decodable on its own. diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py index f6613725680..4e46df0edaf 100644 --- a/tests/models/camembert/test_tokenization_camembert.py +++ b/tests/models/camembert/test_tokenization_camembert.py @@ -15,6 +15,7 @@ import tempfile import unittest +from tempfile import TemporaryDirectory from transformers import AddedToken, CamembertTokenizer, CamembertTokenizerFast from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow @@ -38,12 +39,13 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = CamembertTokenizer(SAMPLE_VOCAB) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) @unittest.skip( "Token maps are not equal because someone set the probability of ('NOTUSED', -100), so it's never encoded for fast" @@ -72,8 +74,9 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_rust_and_python_bpe_tokenizers(self): tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB) - tokenizer.save_pretrained(self.tmpdirname) - rust_tokenizer = CamembertTokenizerFast.from_pretrained(self.tmpdirname) + with TemporaryDirectory() as tmpdirname: + tokenizer.save_pretrained(tmpdirname) + rust_tokenizer = CamembertTokenizerFast.from_pretrained(tmpdirname) sequence = "I was born in 92000, and this is falsé." 
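The Camembert hunk just above (test_rust_and_python_bpe_tokenizers) is the template for any test that serializes a tokenizer other than the class fixture: setUpClass now writes the fixture once into cls.tmpdirname and the cached get_tokenizer()/get_rust_tokenizer() helpers keep reading from it, so a test must not overwrite that directory and instead round-trips through its own scratch directory. A minimal sketch of the same pattern, assuming it sits inside CamembertTokenizationTest and reuses names already available in that module (CamembertTokenizer, CamembertTokenizerFast, SAMPLE_BPE_VOCAB); the method name and the encode-equality assertion are illustrative, not copied from this PR:

    from tempfile import TemporaryDirectory  # this diff already adds the import at module level

    def test_bpe_vocab_roundtrip(self):
        # A tokenizer built from a different fixture than the one setUpClass wrote.
        slow_tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)

        # Never save_pretrained(self.tmpdirname) here: that directory is class-level
        # state shared by every test and by the lru_cache'd tokenizer helpers.
        with TemporaryDirectory() as scratch_dir:
            slow_tokenizer.save_pretrained(scratch_dir)
            fast_tokenizer = CamembertTokenizerFast.from_pretrained(scratch_dir)

        sequence = "I was born in 92000, and this is falsé."
        self.assertListEqual(slow_tokenizer.encode(sequence), fast_tokenizer.encode(sequence))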
@@ -147,11 +150,11 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items())) return tokenizer - new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False) + new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): # Load a slow tokenizer from the hub, init with the new token for fast to also include it - tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos) + tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos) EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"): self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos) @@ -191,9 +194,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"): if self.rust_tokenizer_class is not None: - tokenizer_fast = self.rust_tokenizer_class.from_pretrained( - pretrained_name, eos_token=new_eos, from_slow=True - ) + tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos, from_slow=True) self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos) self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values())) # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py index 13fcbb04999..e2efc99ca99 100644 --- a/tests/models/canine/test_tokenization_canine.py +++ b/tests/models/canine/test_tokenization_canine.py @@ -18,13 +18,14 @@ import os import shutil import tempfile import unittest +from functools import lru_cache from transformers import BatchEncoding, CanineTokenizer from transformers.testing_utils import require_tokenizers, require_torch from transformers.tokenization_utils import AddedToken from transformers.utils import cached_property -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -32,17 +33,22 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = CanineTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = CanineTokenizer() - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) @cached_property def canine_tokenizer(self): return CanineTokenizer.from_pretrained("google/canine-s") - def get_tokenizer(self, **kwargs) -> CanineTokenizer: - tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> CanineTokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + tokenizer = cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer._unicode_vocab_size = 1024 return tokenizer diff --git a/tests/models/clip/test_tokenization_clip.py b/tests/models/clip/test_tokenization_clip.py 
index c24f554a078..f0dfec6bd7e 100644 --- a/tests/models/clip/test_tokenization_clip.py +++ b/tests/models/clip/test_tokenization_clip.py @@ -17,12 +17,13 @@ import json import os import unittest +from functools import lru_cache from transformers import CLIPTokenizer, CLIPTokenizerFast from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES from transformers.testing_utils import require_ftfy, require_tokenizers -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -34,28 +35,37 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_kwargs = {} test_seq2seq = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"] # fmt: skip vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "l o", "lo w", "e r"] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return CLIPTokenizer.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return CLIPTokenizerFast.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "lower newer" @@ -77,8 +87,8 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_check_encoding_slow_fast(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_s = self.get_tokenizer(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d." 
text_tokenized_s = tokenizer_s.tokenize(text) @@ -138,7 +148,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` text = f"{text_of_1_token} {text_of_1_token}" - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, ) @@ -151,7 +161,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): text = f" {text}" - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, ) @@ -166,7 +176,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Test related to the breaking change introduced in transformers v4.17.0 # We need to check that an error in raised when the user try to load a previous version of the tokenizer. with self.assertRaises(ValueError) as context: - self.rust_tokenizer_class.from_pretrained("robot-test/old-clip-tokenizer") + self.get_rust_tokenizer("robot-test/old-clip-tokenizer") self.assertTrue( context.exception.args[0].startswith( diff --git a/tests/models/clvp/test_tokenization_clvp.py b/tests/models/clvp/test_tokenization_clvp.py index aa8d2d22a5b..1c526e84d14 100644 --- a/tests/models/clvp/test_tokenization_clvp.py +++ b/tests/models/clvp/test_tokenization_clvp.py @@ -17,11 +17,12 @@ import json import os import unittest +from functools import lru_cache from typing import List from transformers import ClvpTokenizer -from ...test_tokenization_common import TokenizerTesterMixin, slow +from ...test_tokenization_common import TokenizerTesterMixin, slow, use_cache_if_possible class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -32,8 +33,9 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_seq2seq = False test_sentencepiece_ignore_case = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -62,19 +64,23 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, "vocab.json") - self.merges_file = os.path.join(self.tmpdirname, "merges.txt") - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, "vocab.json") + cls.merges_file = os.path.join(cls.tmpdirname, "merges.txt") + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_tokenizer with GPT2->Clvp - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return ClvpTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return ClvpTokenizer.from_pretrained(pretrained_name, **kwargs) # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts def get_input_output_texts(self, tokenizer): @@ -134,7 +140,7 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=15): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) # Simple input s = "This is a simple input" diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py index ee07c54c16a..774c17f5130 100644 --- a/tests/models/code_llama/test_tokenization_code_llama.py +++ b/tests/models/code_llama/test_tokenization_code_llama.py @@ -53,15 +53,16 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece = True from_pretrained_kwargs = {} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer.pad_token = tokenizer.eos_token - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) - def get_tokenizers(self, **kwargs): + def get_tokenizers(cls, **kwargs): kwargs.update({"pad_token": ""}) return super().get_tokenizers(**kwargs) @@ -151,8 +152,8 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ] for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) 
tmpdirname2 = tempfile.mkdtemp() @@ -255,7 +256,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) r_output = tokenizer_r.encode("Hey this is a token") @@ -265,7 +266,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_token_id in r_output) if self.test_slow_tokenizer: - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + tokenizer_cr = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs, # , from_slow=True <- unfortunately too slow to convert diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py index 184c7521629..28d388202b8 100644 --- a/tests/models/codegen/test_tokenization_codegen.py +++ b/tests/models/codegen/test_tokenization_codegen.py @@ -18,12 +18,13 @@ import json import os import re import unittest +from functools import lru_cache from transformers import CodeGenTokenizer, CodeGenTokenizerFast from transformers.models.codegen.tokenization_codegen import VOCAB_FILES_NAMES from transformers.testing_utils import require_tokenizers, slow -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -35,8 +36,9 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_kwargs = {"add_prefix_space": True} test_seq2seq = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -64,22 +66,30 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return CodeGenTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return CodeGenTokenizer.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return CodeGenTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return CodeGenTokenizerFast.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "lower newer" @@ -136,7 +146,7 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=15): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) # Simple input s = "This is a simple input" diff --git a/tests/models/cohere/test_tokenization_cohere.py b/tests/models/cohere/test_tokenization_cohere.py index a8ab85fe3b8..cec1334a335 100644 --- a/tests/models/cohere/test_tokenization_cohere.py +++ b/tests/models/cohere/test_tokenization_cohere.py @@ -13,12 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import unittest +from functools import lru_cache from transformers import CohereTokenizerFast from transformers.testing_utils import require_jinja, require_tokenizers, require_torch_multi_gpu -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -37,14 +39,21 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "pad_token": "", } - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = CohereTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-CohereForCausalLM") - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return CohereTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + _kwargs = copy.deepcopy(cls.special_tokens_map) + _kwargs.update(kwargs) + kwargs = _kwargs + pretrained_name = pretrained_name or cls.tmpdirname + return CohereTokenizerFast.from_pretrained(pretrained_name, **kwargs) # This gives CPU OOM on a single-gpu runner (~60G RAM). On multi-gpu runner, it has ~180G RAM which is enough. @require_torch_multi_gpu @@ -80,7 +89,7 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=10): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) # tokenizer_r.pad_token = None # Hotfixing padding = None # Simple input s = "This is a simple input" diff --git a/tests/models/cpmant/test_tokenization_cpmant.py b/tests/models/cpmant/test_tokenization_cpmant.py index 042473065be..32449763eae 100644 --- a/tests/models/cpmant/test_tokenization_cpmant.py +++ b/tests/models/cpmant/test_tokenization_cpmant.py @@ -28,8 +28,9 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = CpmAntTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "", @@ -49,8 +50,8 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "n", "t", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) @tooslow diff --git a/tests/models/ctrl/test_tokenization_ctrl.py b/tests/models/ctrl/test_tokenization_ctrl.py index 7fe61f36074..e22ca8abe59 100644 --- a/tests/models/ctrl/test_tokenization_ctrl.py +++ b/tests/models/ctrl/test_tokenization_ctrl.py @@ -16,10 +16,11 @@ import json import os import unittest +from functools import lru_cache from transformers.models.ctrl.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible class CTRLTokenizationTest(TokenizerTesterMixin, 
unittest.TestCase): @@ -28,25 +29,30 @@ class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = False test_seq2seq = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "a p", "ap t", "r e", "a d", "ad apt", ""] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return CTRLTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "adapt react readapt apt" diff --git a/tests/models/deberta/test_tokenization_deberta.py b/tests/models/deberta/test_tokenization_deberta.py index 96248cf2ec1..dc3c84c8713 100644 --- a/tests/models/deberta/test_tokenization_deberta.py +++ b/tests/models/deberta/test_tokenization_deberta.py @@ -17,12 +17,13 @@ import json import os import unittest +from functools import lru_cache from transformers import DebertaTokenizer, DebertaTokenizerFast from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES from transformers.testing_utils import slow -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -31,8 +32,9 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True rust_tokenizer_class = DebertaTokenizerFast - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -59,18 +61,22 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": "[UNK]"} + cls.special_tokens_map = {"unk_token": "[UNK]"} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "lower newer" diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py index da59fa28292..c2e57a58090 100644 --- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py +++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py @@ -33,12 +33,13 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece = True test_sentencepiece_ignore_case = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="") - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def get_input_output_texts(self, tokenizer): input_text = "this is a test" diff --git a/tests/models/distilbert/test_tokenization_distilbert.py b/tests/models/distilbert/test_tokenization_distilbert.py index c61393f6a6a..42f6d6a4ad1 100644 --- a/tests/models/distilbert/test_tokenization_distilbert.py +++ b/tests/models/distilbert/test_tokenization_distilbert.py @@ -17,11 +17,11 @@ from transformers import DistilBertTokenizer, DistilBertTokenizerFast from transformers.testing_utils import require_tokenizers, slow -from ..bert.test_tokenization_bert import BertTokenizationTest +from ..bert import test_tokenization_bert @require_tokenizers -class DistilBertTokenizationTest(BertTokenizationTest): +class DistilBertTokenizationTest(test_tokenization_bert.BertTokenizationTest): tokenizer_class = DistilBertTokenizer rust_tokenizer_class = DistilBertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/dpr/test_tokenization_dpr.py b/tests/models/dpr/test_tokenization_dpr.py index 1fd3d8bdb9e..28c5562ec8d 100644 --- a/tests/models/dpr/test_tokenization_dpr.py +++ b/tests/models/dpr/test_tokenization_dpr.py @@ -25,11 +25,11 @@ from transformers import ( from transformers.testing_utils import require_tokenizers, slow from transformers.tokenization_utils_base 
import BatchEncoding -from ..bert.test_tokenization_bert import BertTokenizationTest +from ..bert import test_tokenization_bert @require_tokenizers -class DPRContextEncoderTokenizationTest(BertTokenizationTest): +class DPRContextEncoderTokenizationTest(test_tokenization_bert.BertTokenizationTest): tokenizer_class = DPRContextEncoderTokenizer rust_tokenizer_class = DPRContextEncoderTokenizerFast test_rust_tokenizer = True @@ -37,7 +37,7 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest): @require_tokenizers -class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): +class DPRQuestionEncoderTokenizationTest(test_tokenization_bert.BertTokenizationTest): tokenizer_class = DPRQuestionEncoderTokenizer rust_tokenizer_class = DPRQuestionEncoderTokenizerFast test_rust_tokenizer = True @@ -45,7 +45,7 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): @require_tokenizers -class DPRReaderTokenizationTest(BertTokenizationTest): +class DPRReaderTokenizationTest(test_tokenization_bert.BertTokenizationTest): tokenizer_class = DPRReaderTokenizer rust_tokenizer_class = DPRReaderTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/electra/test_tokenization_electra.py b/tests/models/electra/test_tokenization_electra.py index 2a9c47b93c2..0155c21bf28 100644 --- a/tests/models/electra/test_tokenization_electra.py +++ b/tests/models/electra/test_tokenization_electra.py @@ -40,8 +40,9 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): space_between_special_tokens = True from_pretrained_filter = filter_non_english - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "[UNK]", @@ -60,8 +61,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "low", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): @@ -250,7 +251,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_offsets_with_special_characters(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
tokens = tokenizer_r.encode_plus( @@ -305,8 +306,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): kwargs["tokenize_chinese_chars"] = True - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) @@ -319,8 +320,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char) kwargs["tokenize_chinese_chars"] = False - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) diff --git a/tests/models/esm/test_tokenization_esm.py b/tests/models/esm/test_tokenization_esm.py index aac03b535ed..e0013d8a189 100644 --- a/tests/models/esm/test_tokenization_esm.py +++ b/tests/models/esm/test_tokenization_esm.py @@ -17,6 +17,7 @@ import os import tempfile import unittest +from functools import lru_cache from typing import List from transformers.models.esm.tokenization_esm import VOCAB_FILES_NAMES, EsmTokenizer @@ -24,24 +25,32 @@ from transformers.testing_utils import require_tokenizers from transformers.tokenization_utils import PreTrainedTokenizer from transformers.tokenization_utils_base import PreTrainedTokenizerBase +from ...test_tokenization_common import use_cache_if_possible + @require_tokenizers class ESMTokenizationTest(unittest.TestCase): tokenizer_class = EsmTokenizer - def setUp(self): - super().setUp() - self.tmpdirname = tempfile.mkdtemp() + @classmethod + def setUpClass(cls): + super().setUpClass() + + cls.tmpdirname = tempfile.mkdtemp() vocab_tokens: List[str] = ["", "", "", "", "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z", "O", ".", "-", "", ""] # fmt: skip - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]: - return [self.get_tokenizer(**kwargs)] + def get_tokenizers(cls, **kwargs) -> List[PreTrainedTokenizerBase]: + return [cls.get_tokenizer(**kwargs)] - def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer: - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizer: + pretrained_name = 
pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) def test_tokenizer_single_example(self): tokenizer = self.tokenizer_class(self.vocab_file) diff --git a/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py index 72acb83999b..23c8a35dc65 100644 --- a/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py +++ b/tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py @@ -28,10 +28,11 @@ class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase) tokenizer_class = FastSpeech2ConformerTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def get_input_output_texts(self, tokenizer): input_text = "this is a test" diff --git a/tests/models/flaubert/test_tokenization_flaubert.py b/tests/models/flaubert/test_tokenization_flaubert.py index 6a90de030d2..0fd42da306e 100644 --- a/tests/models/flaubert/test_tokenization_flaubert.py +++ b/tests/models/flaubert/test_tokenization_flaubert.py @@ -30,8 +30,9 @@ class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = FlaubertTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w", "r", "t", "i", "lo", "low", "ne", "new", "er", "low", "lowest", "new", "newer", "wider", ""] # fmt: skip @@ -39,11 +40,11 @@ class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["n e 300", "ne w 301", "e r 302", ""] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) # Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_full_tokenizer diff --git a/tests/models/fnet/test_tokenization_fnet.py b/tests/models/fnet/test_tokenization_fnet.py index a55c142b25d..c8156a7b208 100644 --- a/tests/models/fnet/test_tokenization_fnet.py +++ b/tests/models/fnet/test_tokenization_fnet.py @@ -36,12 +36,13 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece_ignore_case = True test_seq2seq = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = FNetTokenizer(SAMPLE_VOCAB) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def get_input_output_texts(self, tokenizer): input_text = "this is a test" @@ 
-147,7 +148,7 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) r_output = tokenizer_r.encode("Hey this is a token") @@ -175,7 +176,7 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True ) special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] @@ -198,8 +199,8 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id diff --git a/tests/models/fsmt/test_tokenization_fsmt.py b/tests/models/fsmt/test_tokenization_fsmt.py index bac487767ba..cbc96922554 100644 --- a/tests/models/fsmt/test_tokenization_fsmt.py +++ b/tests/models/fsmt/test_tokenization_fsmt.py @@ -34,8 +34,9 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = FSMTTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -64,22 +65,22 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["l o 123", "lo w 1456", "e r 1789", ""] - self.langs = ["en", "ru"] + cls.langs = ["en", "ru"] config = { - "langs": self.langs, + "langs": cls.langs, "src_vocab_size": 10, "tgt_vocab_size": 20, } - self.src_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"]) - self.tgt_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"]) - config_file = os.path.join(self.tmpdirname, "tokenizer_config.json") - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.src_vocab_file, "w") as fp: + cls.src_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"]) + cls.tgt_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"]) + config_file = os.path.join(cls.tmpdirname, "tokenizer_config.json") + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.src_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens)) - with open(self.tgt_vocab_file, "w") as fp: + with open(cls.tgt_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: + with open(cls.merges_file, "w") as fp: fp.write("\n".join(merges)) with open(config_file, "w") as fp: fp.write(json.dumps(config)) diff --git a/tests/models/funnel/test_tokenization_funnel.py b/tests/models/funnel/test_tokenization_funnel.py index 9ddb3b325d0..d5e22a3c6ed 100644 --- a/tests/models/funnel/test_tokenization_funnel.py +++ b/tests/models/funnel/test_tokenization_funnel.py @@ -16,12 +16,13 @@ import os import unittest +from functools import lru_cache from transformers import FunnelTokenizer, FunnelTokenizerFast from transformers.models.funnel.tokenization_funnel import VOCAB_FILES_NAMES from transformers.testing_utils import require_tokenizers -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -32,8 +33,9 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True space_between_special_tokens = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "", @@ -50,15 +52,23 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "low", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - def get_tokenizer(self, **kwargs): - return FunnelTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + pretrained_name = pretrained_name or cls.tmpdirname + return FunnelTokenizer.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - return FunnelTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + pretrained_name = 
pretrained_name or cls.tmpdirname + return FunnelTokenizerFast.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "UNwant\u00e9d,running" diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py index 0b43fec8055..e48d19a2534 100644 --- a/tests/models/gemma/test_tokenization_gemma.py +++ b/tests/models/gemma/test_tokenization_gemma.py @@ -53,12 +53,13 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece = True from_pretrained_kwargs = {} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer.pad_token = tokenizer.eos_token - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) @require_torch def test_batch_tokenization(self): @@ -103,7 +104,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) r_output = tokenizer_r.encode("Hey this is a token") @@ -113,7 +114,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_token_id in r_output) if self.test_slow_tokenizer: - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + tokenizer_cr = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs, # , from_slow=True <- unfortunately too slow to convert diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py index 5d63061dafb..40e9f2fe48e 100644 --- a/tests/models/gpt2/test_tokenization_gpt2.py +++ b/tests/models/gpt2/test_tokenization_gpt2.py @@ -17,12 +17,13 @@ import json import os import unittest +from functools import lru_cache from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -34,8 +35,9 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_kwargs = {"add_prefix_space": True} test_seq2seq = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -63,22 +65,30 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return GPT2Tokenizer.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return GPT2TokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return GPT2TokenizerFast.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "lower newer" @@ -135,7 +145,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=15): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) # Simple input s = "This is a simple input" diff --git a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py index 029c8b99d44..6402c579560 100644 --- a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py +++ b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py @@ -17,6 +17,7 @@ import json import os import unittest +from functools import lru_cache from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import ( VOCAB_FILES_NAMES, @@ -24,7 +25,7 @@ from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import ) from transformers.testing_utils import require_tokenizers, slow -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -34,8 +35,9 @@ class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = False from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False} - def setUp(self): - super().setUp() + @classmethod + def 
setUpClass(cls): + super().setUpClass() vocab_tokens = [ "こん", @@ -62,18 +64,22 @@ class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "<|endoftext|>", ] emoji_tokens = {"emoji": {"\ud83d\ude00": "<|emoji1|>"}, "emoji_inv": {"<|emoji1|>": "\ud83d\ude00"}} # 😀 - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.emoji_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["emoji_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.emoji_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["emoji_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - with open(self.emoji_file, "w") as emoji_writer: + with open(cls.emoji_file, "w") as emoji_writer: emoji_writer.write(json.dumps(emoji_tokens)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return GPTNeoXJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return GPTNeoXJapaneseTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "こんにちは、世界。 \nこんばんは、㔺界。😀" diff --git a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py index eb5de3a6c20..a13be778a6e 100644 --- a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py +++ b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py @@ -33,13 +33,14 @@ class GPTSw3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece = True test_sentencepiece_ignore_case = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, eos_token="", bos_token="", pad_token="") - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def get_input_output_texts(self, tokenizer): input_text = "This is a test" diff --git a/tests/models/herbert/test_tokenization_herbert.py b/tests/models/herbert/test_tokenization_herbert.py index 02b2c54a2f0..36849bb6983 100644 --- a/tests/models/herbert/test_tokenization_herbert.py +++ b/tests/models/herbert/test_tokenization_herbert.py @@ -33,12 +33,13 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): rust_tokenizer_class = HerbertTokenizerFast test_rust_tokenizer = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Use a simpler test file without japanese/chinese characters with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data: - self._data = f_data.read().replace("\n\n", "\n").strip() + cls._data = f_data.read().replace("\n\n", "\n").strip() vocab = [ "", @@ -69,11 +70,11 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["l o 123", "lo w 1456", "e r 1789", ""] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, 
VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: + with open(cls.merges_file, "w") as fp: fp.write("\n".join(merges)) def get_input_output_texts(self, tokenizer): diff --git a/tests/models/layoutlm/test_tokenization_layoutlm.py b/tests/models/layoutlm/test_tokenization_layoutlm.py index eb0e1de626a..7143d8b0e00 100644 --- a/tests/models/layoutlm/test_tokenization_layoutlm.py +++ b/tests/models/layoutlm/test_tokenization_layoutlm.py @@ -16,12 +16,13 @@ import os import unittest +from functools import lru_cache from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast from transformers.models.layoutlm.tokenization_layoutlm import VOCAB_FILES_NAMES from transformers.testing_utils import require_tokenizers -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -32,8 +33,9 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True space_between_special_tokens = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "[UNK]", @@ -50,12 +52,16 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "low", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - def get_tokenizer(self, **kwargs): - return LayoutLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + pretrained_name = pretrained_name or cls.tmpdirname + return LayoutLMTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "UNwant\u00e9d,running" diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index c7594ea78ae..e2271d60c6d 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -102,8 +102,9 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): return questions, words, boxes - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "[UNK]", @@ -122,8 +123,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): "test", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): @@ -267,7 +268,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def 
test_offsets_with_special_characters(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) words, boxes = self.get_words_and_boxes() words[1] = tokenizer_r.mask_token @@ -605,8 +606,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=50): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id @@ -1060,7 +1061,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) # Input tokens id words, boxes = self.get_words_and_boxes() @@ -1363,7 +1364,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) words, boxes = self.get_words_and_boxes() @@ -1417,7 +1418,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) words, boxes = self.get_words_and_boxes() tokens_r = tokenizer_r.encode_plus( words, @@ -1715,7 +1716,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index a67c7e4a114..48e36411dbc 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -20,6 +20,7 @@ import re import shutil 
import tempfile import unittest +from functools import lru_cache from typing import List from parameterized import parameterized @@ -41,7 +42,12 @@ from transformers.testing_utils import ( slow, ) -from ...test_tokenization_common import SMALL_TRAINING_CORPUS, TokenizerTesterMixin, merge_model_tokenizer_mappings +from ...test_tokenization_common import ( + SMALL_TRAINING_CORPUS, + TokenizerTesterMixin, + merge_model_tokenizer_mappings, + use_cache_if_possible, +) logger = logging.get_logger(__name__) @@ -91,8 +97,9 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): return questions, words, boxes - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -119,22 +126,30 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return LayoutLMv3TokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return LayoutLMv3TokenizerFast.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "lower newer" @@ -485,8 +500,8 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=50): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id @@ -940,7 +955,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs 
in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) # Input tokens id words, boxes = self.get_words_and_boxes() @@ -1241,7 +1256,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) words, boxes = self.get_words_and_boxes() @@ -1295,7 +1310,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) words, boxes = self.get_words_and_boxes() tokens_r = tokenizer_r.encode_plus( words, @@ -1593,7 +1608,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index d933dca92f3..36f837a89f8 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -96,12 +96,13 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): return questions, words, boxes - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = LayoutXLMTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def get_input_output_texts(self, tokenizer): input_text = "UNwant\u00e9d,running" @@ -157,7 +158,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): _, _, boxes = self.get_question_words_and_boxes() with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_rust = self.rust_tokenizer_class.from_pretrained( + tokenizer_rust = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs ) tokenizer_py = self.tokenizer_class.from_pretrained( @@ -206,7 +207,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_offsets_with_special_characters(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r 
= self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) words, boxes = self.get_words_and_boxes() words[1] = tokenizer_r.mask_token @@ -536,8 +537,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=50): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id @@ -990,8 +991,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) # Input tokens id words, boxes = self.get_words_and_boxes() @@ -1292,7 +1293,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) words, boxes = self.get_words_and_boxes() @@ -1346,7 +1347,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) words, boxes = self.get_words_and_boxes() tokens_r = tokenizer_r.encode_plus( words, @@ -1644,7 +1645,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id @@ -1743,7 +1744,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) 
tmpdirname2 = tempfile.mkdtemp() diff --git a/tests/models/led/test_tokenization_led.py b/tests/models/led/test_tokenization_led.py index 7d677bf3f5e..a50acac048d 100644 --- a/tests/models/led/test_tokenization_led.py +++ b/tests/models/led/test_tokenization_led.py @@ -14,13 +14,14 @@ import json import os import unittest +from functools import lru_cache from transformers import BatchEncoding, LEDTokenizer, LEDTokenizerFast from transformers.models.led.tokenization_led import VOCAB_FILES_NAMES from transformers.testing_utils import require_tokenizers, require_torch from transformers.utils import cached_property -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -30,8 +31,10 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase): rust_tokenizer_class = LEDTokenizerFast test_rust_tokenizer = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() + vocab = [ "l", "o", @@ -56,22 +59,30 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase): ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): return "lower newer", "lower newer" @@ -161,8 +172,8 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) 
sentence = "A, AllenNLP sentence." tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index 2ae9127dee3..2c0e15bffda 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -60,13 +60,14 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece = True from_pretrained_kwargs = {} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer.pad_token = tokenizer.eos_token - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def get_tokenizers(self, **kwargs): kwargs.update({"pad_token": ""}) @@ -149,8 +150,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.tokenizers_list += (self.rust_tokenizer_class, "hf-internal-testing/llama-tokenizer", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) tmpdirname2 = tempfile.mkdtemp() @@ -253,7 +254,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) r_output = tokenizer_r.encode("Hey this is a token") @@ -263,7 +264,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_token_id in r_output) if self.test_slow_tokenizer: - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + tokenizer_cr = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs, # , from_slow=True <- unfortunately too slow to convert @@ -313,8 +314,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): EXPECTED_WITH_SPACE = [1, 18637, 920, 526, 366, 2599] EXPECTED_WO_SPACE = [1, 29950, 1032, 920, 526, 366, 2599] - slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False) - fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False) + slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=False, legacy=False) + fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=False, legacy=False) self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE) self.assertEqual(slow_.encode(inputs), fast_.encode(inputs)) self.assertEqual(slow_.tokenize(inputs), ["H", "ey", "▁how", "▁are", "▁you", "▁doing"]) @@ -324,8 +325,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True), ) - slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False) - fast_ = 
self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False) + slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=True, legacy=False) + fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=True, legacy=False) self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE) self.assertEqual(slow_.encode(inputs), fast_.encode(inputs)) self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"]) diff --git a/tests/models/longformer/test_tokenization_longformer.py b/tests/models/longformer/test_tokenization_longformer.py index 65c42a0cab9..303a9ae2d09 100644 --- a/tests/models/longformer/test_tokenization_longformer.py +++ b/tests/models/longformer/test_tokenization_longformer.py @@ -18,12 +18,13 @@ import itertools import json import os import unittest +from functools import lru_cache from transformers import AddedToken, LongformerTokenizer, LongformerTokenizerFast from transformers.models.longformer.tokenization_longformer import VOCAB_FILES_NAMES from transformers.testing_utils import require_tokenizers, slow -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -36,8 +37,9 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): rust_tokenizer_class = LongformerTokenizerFast test_rust_tokenizer = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -64,22 +66,30 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "lower newer" @@ 
-173,8 +183,8 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) sentence = "A, AllenNLP sentence." tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) @@ -204,7 +214,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_change_add_prefix_space_and_trim_offsets_args(self): for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2): - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets ) @@ -224,7 +234,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` text = f"{text_of_1_token} {text_of_1_token}" - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -234,7 +244,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -244,7 +254,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -254,7 +264,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)), ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -276,7 +286,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), # ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -286,7 +296,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 
(1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -296,7 +306,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) diff --git a/tests/models/luke/test_tokenization_luke.py b/tests/models/luke/test_tokenization_luke.py index a648f28d1ff..b935f3e49c4 100644 --- a/tests/models/luke/test_tokenization_luke.py +++ b/tests/models/luke/test_tokenization_luke.py @@ -14,12 +14,13 @@ # limitations under the License. import unittest +from functools import lru_cache from typing import Tuple from transformers import AddedToken, LukeTokenizer from transformers.testing_utils import get_tests_dir, require_torch, slow -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json") @@ -33,13 +34,17 @@ class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = False from_pretrained_kwargs = {"cls_token": ""} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() - self.special_tokens_map = {"entity_token_1": "", "entity_token_2": ""} + cls.special_tokens_map = {"entity_token_1": "", "entity_token_2": ""} - def get_tokenizer(self, task=None, **kwargs): - kwargs.update(self.special_tokens_map) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, task=None, **kwargs): + kwargs.update(cls.special_tokens_map) tokenizer = LukeTokenizer( vocab_file=SAMPLE_VOCAB, merges_file=SAMPLE_MERGE_FILE, @@ -137,8 +142,8 @@ class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) sentence = "A, AllenNLP sentence." 
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) diff --git a/tests/models/lxmert/test_tokenization_lxmert.py b/tests/models/lxmert/test_tokenization_lxmert.py index 6f1c5306ff3..b634d259ed5 100644 --- a/tests/models/lxmert/test_tokenization_lxmert.py +++ b/tests/models/lxmert/test_tokenization_lxmert.py @@ -32,8 +32,9 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True space_between_special_tokens = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "[UNK]", @@ -50,8 +51,8 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "low", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): diff --git a/tests/models/m2m_100/test_tokenization_m2m_100.py b/tests/models/m2m_100/test_tokenization_m2m_100.py index 76cadf2f3bc..0632eaf2487 100644 --- a/tests/models/m2m_100/test_tokenization_m2m_100.py +++ b/tests/models/m2m_100/test_tokenization_m2m_100.py @@ -14,6 +14,7 @@ import tempfile import unittest +from functools import lru_cache from pathlib import Path from shutil import copyfile @@ -32,7 +33,7 @@ from transformers.utils import is_sentencepiece_available if is_sentencepiece_available(): from transformers.models.m2m_100.tokenization_m2m_100 import VOCAB_FILES_NAMES, save_json -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible if is_sentencepiece_available(): @@ -54,21 +55,26 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_seq2seq = False test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab = ["", "", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - save_dir = Path(self.tmpdirname) + save_dir = Path(cls.tmpdirname) save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"]) if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists(): copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"]) - tokenizer = M2M100Tokenizer.from_pretrained(self.tmpdirname) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer = M2M100Tokenizer.from_pretrained(cls.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) - def get_tokenizer(self, **kwargs): - return M2M100Tokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + pretrained_name = pretrained_name or cls.tmpdirname + return M2M100Tokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): return ( diff --git a/tests/models/marian/test_tokenization_marian.py b/tests/models/marian/test_tokenization_marian.py index 3ef85e24de6..03814663604 100644 --- a/tests/models/marian/test_tokenization_marian.py +++ b/tests/models/marian/test_tokenization_marian.py @@ -15,6 +15,7 @@ import tempfile import 
unittest +from functools import lru_cache from pathlib import Path from shutil import copyfile @@ -26,7 +27,7 @@ from transformers.utils import is_sentencepiece_available, is_tf_available, is_t if is_sentencepiece_available(): from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model") @@ -50,22 +51,28 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = False test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() + vocab = ["", "", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", ""] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - save_dir = Path(self.tmpdirname) + save_dir = Path(cls.tmpdirname) save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab"]) save_json(mock_tokenizer_config, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"]) if not (save_dir / VOCAB_FILES_NAMES["source_spm"]).exists(): copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["source_spm"]) copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["target_spm"]) - tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer = MarianTokenizer.from_pretrained(cls.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) - def get_tokenizer(self, **kwargs) -> MarianTokenizer: - return MarianTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> MarianTokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + return MarianTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): return ( diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index d2e4614dfcd..c7cb39964fa 100644 --- a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -50,26 +50,27 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_kwargs = {"cls_token": ""} test_seq2seq = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", "\u0120hello", "\u0120world", "",] # fmt: skip vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3} - self.special_tokens_map = {"unk_token": ""} + cls.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - self.tokenizer_config_file = os.path.join(self.tmpdirname, "tokenizer_config.json") + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + cls.tokenizer_config_file = os.path.join(cls.tmpdirname, "tokenizer_config.json") - with open(self.vocab_file, "w", encoding="utf-8") as fp: + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - with open(self.tokenizer_config_file, "w", encoding="utf-8") as fp: - fp.write(json.dumps({"tags_dict": self.tags_dict})) + with open(cls.tokenizer_config_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps({"tags_dict": cls.tags_dict})) def get_nodes_and_xpaths(self): nodes = ["hello", "world"] @@ -421,8 +422,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=50): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id @@ -828,8 +829,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) # Input tokens id nodes, xpaths = self.get_nodes_and_xpaths() @@ -1010,7 +1011,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_offsets_mapping(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) text = ["a", "wonderful", "test"] xpaths = ["html/body" for _ in range(len(text))] @@ -1125,7 +1126,7 @@ class 
MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) nodes, xpaths = self.get_nodes_and_xpaths() @@ -1187,7 +1188,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) nodes, xpaths = self.get_nodes_and_xpaths() tokens_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True) tokens_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True) @@ -1490,7 +1491,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id diff --git a/tests/models/mbart/test_tokenization_mbart.py b/tests/models/mbart/test_tokenization_mbart.py index bbe800357e7..f219965ae52 100644 --- a/tests/models/mbart/test_tokenization_mbart.py +++ b/tests/models/mbart/test_tokenization_mbart.py @@ -47,12 +47,13 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_full_tokenizer(self): tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True) @@ -139,8 +140,8 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) tmpdirname2 = tempfile.mkdtemp() diff --git a/tests/models/mbart50/test_tokenization_mbart50.py b/tests/models/mbart50/test_tokenization_mbart50.py index cd86bcf623a..bed8b8cb376 100644 --- a/tests/models/mbart50/test_tokenization_mbart50.py +++ b/tests/models/mbart50/test_tokenization_mbart50.py @@ -47,12 +47,13 @@ class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def 
setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" @@ -117,8 +118,8 @@ class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) tmpdirname2 = tempfile.mkdtemp() diff --git a/tests/models/mgp_str/test_tokenization_mgp_str.py b/tests/models/mgp_str/test_tokenization_mgp_str.py index 91ec39f027a..2d021606ffc 100644 --- a/tests/models/mgp_str/test_tokenization_mgp_str.py +++ b/tests/models/mgp_str/test_tokenization_mgp_str.py @@ -17,12 +17,13 @@ import json import os import unittest +from functools import lru_cache from transformers import MgpstrTokenizer from transformers.models.mgp_str.tokenization_mgp_str import VOCAB_FILES_NAMES from transformers.testing_utils import require_tokenizers -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -33,18 +34,23 @@ class MgpstrTokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_kwargs = {} test_seq2seq = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab = ['[GO]', '[s]', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] # fmt: skip vocab_tokens = dict(zip(vocab, range(len(vocab)))) - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - def get_tokenizer(self, **kwargs): - return MgpstrTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + pretrained_name = pretrained_name or cls.tmpdirname + return MgpstrTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "tester" diff --git a/tests/models/mluke/test_tokenization_mluke.py b/tests/models/mluke/test_tokenization_mluke.py index 7af516849f9..a86ea690a46 100644 --- a/tests/models/mluke/test_tokenization_mluke.py +++ b/tests/models/mluke/test_tokenization_mluke.py @@ -15,12 +15,13 @@ import unittest +from functools import lru_cache from typing import Tuple from transformers.models.mluke.tokenization_mluke import MLukeTokenizer from transformers.testing_utils import get_tests_dir, require_torch, slow -from ...test_tokenization_common 
import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @@ -33,13 +34,17 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = False from_pretrained_kwargs = {"cls_token": ""} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() - self.special_tokens_map = {"entity_token_1": "", "entity_token_2": ""} + cls.special_tokens_map = {"entity_token_1": "", "entity_token_2": ""} - def get_tokenizer(self, task=None, **kwargs): - kwargs.update(self.special_tokens_map) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, task=None, **kwargs): + kwargs.update(cls.special_tokens_map) kwargs.update({"task": task}) tokenizer = MLukeTokenizer(vocab_file=SAMPLE_VOCAB, entity_vocab_file=SAMPLE_ENTITY_VOCAB, **kwargs) return tokenizer @@ -100,8 +105,8 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) sentence = "A, AllenNLP sentence." tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) diff --git a/tests/models/mobilebert/test_tokenization_mobilebert.py b/tests/models/mobilebert/test_tokenization_mobilebert.py index 2a5c250b849..e0f4bff647e 100644 --- a/tests/models/mobilebert/test_tokenization_mobilebert.py +++ b/tests/models/mobilebert/test_tokenization_mobilebert.py @@ -41,8 +41,9 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_filter = filter_non_english pre_trained_model_path = "google/mobilebert-uncased" - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "[UNK]", @@ -61,13 +62,13 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "low", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - self.tokenizers_list = [ - (tokenizer_def[0], self.pre_trained_model_path, tokenizer_def[2]) # else the 'google/' prefix is stripped - for tokenizer_def in self.tokenizers_list + cls.tokenizers_list = [ + (tokenizer_def[0], cls.pre_trained_model_path, tokenizer_def[2]) # else the 'google/' prefix is stripped + for tokenizer_def in cls.tokenizers_list ] # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.get_input_output_texts @@ -275,7 +276,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_offsets_with_special_characters(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with 
self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." tokens = tokenizer_r.encode_plus( @@ -331,8 +332,8 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): kwargs["tokenize_chinese_chars"] = True - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) @@ -345,8 +346,8 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char) kwargs["tokenize_chinese_chars"] = False - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) diff --git a/tests/models/moshi/test_tokenization_moshi.py b/tests/models/moshi/test_tokenization_moshi.py index 0aaa6295ea6..740cfd55d37 100644 --- a/tests/models/moshi/test_tokenization_moshi.py +++ b/tests/models/moshi/test_tokenization_moshi.py @@ -51,8 +51,9 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True from_pretrained_kwargs = {} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = PreTrainedTokenizerFast( @@ -62,10 +63,11 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase): eos_token="", ) tokenizer.pad_token = tokenizer.eos_token - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) - def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast: - return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizerFast: + pretrained_name = pretrained_name or cls.tmpdirname + return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) @unittest.skip(reason="No slow tokenizer") def test_added_tokens_serialization(self): diff --git a/tests/models/mpnet/test_tokenization_mpnet.py b/tests/models/mpnet/test_tokenization_mpnet.py index f1049f8ef54..2b934a9ed68 100644 --- a/tests/models/mpnet/test_tokenization_mpnet.py +++ b/tests/models/mpnet/test_tokenization_mpnet.py @@ -32,8 +32,9 @@ class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True space_between_special_tokens = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ 
"[UNK]", @@ -52,8 +53,8 @@ class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase): "low", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): diff --git a/tests/models/mvp/test_tokenization_mvp.py b/tests/models/mvp/test_tokenization_mvp.py index 9320f8f020d..af44cc961c5 100644 --- a/tests/models/mvp/test_tokenization_mvp.py +++ b/tests/models/mvp/test_tokenization_mvp.py @@ -14,13 +14,14 @@ import json import os import unittest +from functools import lru_cache from transformers import BatchEncoding, MvpTokenizer, MvpTokenizerFast from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES from transformers.testing_utils import require_tokenizers, require_torch from transformers.utils import cached_property -from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors +from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors, use_cache_if_possible @require_tokenizers @@ -32,8 +33,10 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase): from_pretrained_filter = filter_roberta_detectors # from_pretrained_kwargs = {'add_prefix_space': True} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() + vocab = [ "l", "o", @@ -58,22 +61,30 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase): ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): return "lower 
newer", "lower newer" @@ -153,8 +164,8 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) sentence = "A, AllenNLP sentence." tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) diff --git a/tests/models/myt5/test_tokenization_myt5.py b/tests/models/myt5/test_tokenization_myt5.py index 36e10ac36da..aab67978f21 100644 --- a/tests/models/myt5/test_tokenization_myt5.py +++ b/tests/models/myt5/test_tokenization_myt5.py @@ -16,6 +16,7 @@ import binascii import unittest from transformers import MyT5Tokenizer +from transformers.testing_utils import slow from transformers.utils import is_tf_available, is_torch_available from ...test_tokenization_common import TokenizerTesterMixin @@ -86,15 +87,14 @@ class TestByteRewriter(unittest.TestCase): self.assertEqual(decompose_rewriter.rewrite_bytes(in_hex), out_hex) +# This is way too slow, let's not run it on CircleCI. When trying to use cache, we get OOM and worker(s) crashed. +@slow class MyT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = MyT5Tokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() - - def get_tokenizer(self, **kwargs) -> MyT5Tokenizer: - return self.tokenizer_class.from_pretrained("Tomlim/myt5-base", **kwargs) + def get_tokenizer(cls, **kwargs) -> MyT5Tokenizer: + return cls.tokenizer_class.from_pretrained("Tomlim/myt5-base", **kwargs) @unittest.skip(reason="inputs cannot be pretokenized as ids depend on whole input string") def test_pretokenized_inputs(self): diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py index 1759731bc68..0e9f37d5945 100644 --- a/tests/models/nllb/test_tokenization_nllb.py +++ b/tests/models/nllb/test_tokenization_nllb.py @@ -56,12 +56,13 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece = True from_pretrained_kwargs = {} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_full_tokenizer(self): tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True) @@ -143,8 +144,8 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-nllb", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) tmpdirname2 = tempfile.mkdtemp() @@ -262,7 
+263,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) r_output = tokenizer_r.encode("Hey this is a token") @@ -272,7 +273,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_token_id in r_output) if self.test_slow_tokenizer: - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + tokenizer_cr = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs, # , from_slow=True <- unfortunately too slow to convert diff --git a/tests/models/nougat/test_tokenization_nougat.py b/tests/models/nougat/test_tokenization_nougat.py index 38a9e3ba9c0..c5da1f0291b 100644 --- a/tests/models/nougat/test_tokenization_nougat.py +++ b/tests/models/nougat/test_tokenization_nougat.py @@ -13,13 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import unittest +from functools import lru_cache from transformers import NougatTokenizerFast from transformers.models.nougat.tokenization_nougat_fast import markdown_compatible, normalize_list_like_lines from transformers.testing_utils import require_levenshtein, require_nltk, require_tokenizers -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -33,19 +35,26 @@ class NougatTokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_vocab_key = "tokenizer_file" special_tokens_map = {"bos_token": "", "eos_token": "", "unk_token": "", "pad_token": ""} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = NougatTokenizerFast.from_pretrained("facebook/nougat-base") - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return NougatTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + _kwargs = copy.deepcopy(cls.special_tokens_map) + _kwargs.update(kwargs) + kwargs = _kwargs + pretrained_name = pretrained_name or cls.tmpdirname + return NougatTokenizerFast.from_pretrained(pretrained_name, **kwargs) def test_padding(self, max_length=6): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) # Simple input sentence1 = "This is a simple input" sentence2 = ["This is a simple input 1", "This is a simple input 2"] diff --git a/tests/models/openai/test_tokenization_openai.py b/tests/models/openai/test_tokenization_openai.py index 5c8a76a5ae4..e91765f93d4 100644 --- a/tests/models/openai/test_tokenization_openai.py +++ b/tests/models/openai/test_tokenization_openai.py @@ -35,8 +35,9 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_seq2seq = False - def setUp(self): - super().setUp() + @classmethod + 
def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -65,11 +66,11 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "l o", "lo w", "e r", ""] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: + with open(cls.merges_file, "w") as fp: fp.write("\n".join(merges)) def get_input_output_texts(self, tokenizer): @@ -90,7 +91,7 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=15): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) # Simple input s = "This is a simple input" diff --git a/tests/models/pegasus/test_tokenization_pegasus.py b/tests/models/pegasus/test_tokenization_pegasus.py index bb52b8c4731..35292d5f240 100644 --- a/tests/models/pegasus/test_tokenization_pegasus.py +++ b/tests/models/pegasus/test_tokenization_pegasus.py @@ -13,12 +13,13 @@ # limitations under the License. import unittest +from functools import lru_cache from transformers import PegasusTokenizer, PegasusTokenizerFast from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow from transformers.utils import cached_property -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model") @@ -33,19 +34,24 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = PegasusTokenizer(SAMPLE_VOCAB) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) @cached_property def _large_tokenizer(self): return PegasusTokenizer.from_pretrained("google/pegasus-large") - def get_tokenizer(self, **kwargs) -> PegasusTokenizer: - return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PegasusTokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + return PegasusTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): return ("This is a test", "This is a test") @@ -70,8 +76,8 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertEqual(self.get_tokenizer().vocab_size, 1_103) def test_mask_tokens_rust_pegasus(self): - rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname) - py_tokenizer = 
self.tokenizer_class.from_pretrained(self.tmpdirname) + rust_tokenizer = self.get_rust_tokenizer(self.tmpdirname) + py_tokenizer = self.get_tokenizer(self.tmpdirname) raw_input_str = ( "Let's see which is the better one It seems like this was important" " " @@ -138,26 +144,31 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = PegasusTokenizer(SAMPLE_VOCAB, offset=0, mask_token_sent=None, mask_token="[MASK]") - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) @cached_property def _large_tokenizer(self): return PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv") - def get_tokenizer(self, **kwargs) -> PegasusTokenizer: - return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PegasusTokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + return PegasusTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): return ("This is a test", "This is a test") def test_mask_tokens_rust_pegasus(self): - rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname) - py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname) + rust_tokenizer = self.get_rust_tokenizer(self.tmpdirname) + py_tokenizer = self.get_tokenizer(self.tmpdirname) raw_input_str = ( "Let's see which is the better one [MASK] It seems like this [MASK] was important " " " diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py index de9bf36b434..16c279ae18a 100644 --- a/tests/models/perceiver/test_tokenization_perceiver.py +++ b/tests/models/perceiver/test_tokenization_perceiver.py @@ -19,12 +19,13 @@ import re import shutil import tempfile import unittest +from functools import lru_cache from typing import Tuple from transformers import AddedToken, BatchEncoding, PerceiverTokenizer from transformers.utils import cached_property, is_tf_available, is_torch_available -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible if is_torch_available(): @@ -40,17 +41,22 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = PerceiverTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = PerceiverTokenizer() - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) @cached_property def perceiver_tokenizer(self): return PerceiverTokenizer.from_pretrained("deepmind/language-perceiver") - def get_tokenizer(self, **kwargs) -> PerceiverTokenizer: - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PerceiverTokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]: # XXX The default common tokenizer tests 
assume that every ID is decodable on its own. diff --git a/tests/models/phobert/test_tokenization_phobert.py b/tests/models/phobert/test_tokenization_phobert.py index bdf02d5f51a..323355e3cab 100644 --- a/tests/models/phobert/test_tokenization_phobert.py +++ b/tests/models/phobert/test_tokenization_phobert.py @@ -15,10 +15,11 @@ import os import unittest +from functools import lru_cache from transformers.models.phobert.tokenization_phobert import VOCAB_FILES_NAMES, PhobertTokenizer -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -26,27 +27,32 @@ class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = PhobertTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt vocab = ["T@@", "i", "I", "R@@", "r", "e@@"] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "l à"] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + with open(cls.vocab_file, "w", encoding="utf-8") as fp: for token in vocab_tokens: fp.write(f"{token} {vocab_tokens[token]}\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return PhobertTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return PhobertTokenizer.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "Tôi là VinAI Research" diff --git a/tests/models/plbart/test_tokenization_plbart.py b/tests/models/plbart/test_tokenization_plbart.py index ff0ef386e37..1ac7d1a7d17 100644 --- a/tests/models/plbart/test_tokenization_plbart.py +++ b/tests/models/plbart/test_tokenization_plbart.py @@ -45,12 +45,13 @@ class PLBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): rust_tokenizer_class = None test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_full_base_tokenizer(self): tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True) diff --git a/tests/models/prophetnet/test_tokenization_prophetnet.py b/tests/models/prophetnet/test_tokenization_prophetnet.py index 5eede4d3841..b271c898da1 100644 --- a/tests/models/prophetnet/test_tokenization_prophetnet.py +++ 
b/tests/models/prophetnet/test_tokenization_prophetnet.py @@ -36,8 +36,9 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = ProphetNetTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "[UNK]", @@ -56,8 +57,8 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "low", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): diff --git a/tests/models/qwen2/test_tokenization_qwen2.py b/tests/models/qwen2/test_tokenization_qwen2.py index b188fd2f8c4..e37ecac9694 100644 --- a/tests/models/qwen2/test_tokenization_qwen2.py +++ b/tests/models/qwen2/test_tokenization_qwen2.py @@ -14,15 +14,17 @@ # limitations under the License. +import copy import json import os import unittest +from functools import lru_cache from transformers import AddedToken, Qwen2Tokenizer, Qwen2TokenizerFast from transformers.models.qwen2.tokenization_qwen2 import VOCAB_FILES_NAMES, bytes_to_unicode from transformers.testing_utils import require_tokenizers, slow -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -36,8 +38,9 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_kwargs = None test_seq2seq = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # this make sure the vocabuary is complete at the byte level. 
vocab = list(bytes_to_unicode().values()) @@ -81,22 +84,34 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): "# #", ] - self.special_tokens_map = {"eos_token": "<|endoftext|>"} + cls.special_tokens_map = {"eos_token": "<|endoftext|>"} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return Qwen2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + _kwargs = copy.deepcopy(cls.special_tokens_map) + _kwargs.update(kwargs) + kwargs = _kwargs + pretrained_name = pretrained_name or cls.tmpdirname + return Qwen2Tokenizer.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return Qwen2TokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + _kwargs = copy.deepcopy(cls.special_tokens_map) + _kwargs.update(kwargs) + kwargs = _kwargs + pretrained_name = pretrained_name or cls.tmpdirname + return Qwen2TokenizerFast.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): # this case should cover diff --git a/tests/models/reformer/test_tokenization_reformer.py b/tests/models/reformer/test_tokenization_reformer.py index cf7599014c4..d5e3901b3fb 100644 --- a/tests/models/reformer/test_tokenization_reformer.py +++ b/tests/models/reformer/test_tokenization_reformer.py @@ -34,11 +34,12 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_seq2seq = False test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" @@ -84,7 +85,7 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=15): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) # Simple input s = "This is a simple input" diff --git a/tests/models/rembert/test_tokenization_rembert.py b/tests/models/rembert/test_tokenization_rembert.py index 113d7b7676a..c2ee3619f8e 100644 --- a/tests/models/rembert/test_tokenization_rembert.py +++ b/tests/models/rembert/test_tokenization_rembert.py @@ -39,11 +39,12 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): 
test_sentencepiece_ignore_case = True pre_trained_model_path = "google/rembert" - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = RemBertTokenizer(SAMPLE_VOCAB) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) # Copied from ReformerTokenizationTest.get_input_output_texts def get_input_output_texts(self, tokenizer): @@ -222,7 +223,7 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"): if self.rust_tokenizer_class is not None: - tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos) + tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos) self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos) self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values())) # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright diff --git a/tests/models/roberta/test_tokenization_roberta.py b/tests/models/roberta/test_tokenization_roberta.py index 84fde55e0ae..e2760f646cc 100644 --- a/tests/models/roberta/test_tokenization_roberta.py +++ b/tests/models/roberta/test_tokenization_roberta.py @@ -18,12 +18,13 @@ import itertools import json import os import unittest +from functools import lru_cache from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES from transformers.testing_utils import require_tokenizers, slow -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_tokenizers @@ -34,8 +35,9 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True from_pretrained_kwargs = {"cls_token": ""} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -62,22 +64,30 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ] vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] - self.special_tokens_map = {"unk_token": ""} + cls.special_tokens_map = {"unk_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - with open(self.merges_file, "w", encoding="utf-8") as fp: + with open(cls.merges_file, "w", encoding="utf-8") as fp: fp.write("\n".join(merges)) - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) def get_input_output_texts(self, tokenizer): input_text = "lower newer" @@ -171,8 +181,8 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_embeded_special_tokens(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) sentence = "A, AllenNLP sentence." 
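Two kwargs-merging styles coexist in these getters: the RoBERTa hunk here (like BART and MVP earlier) keeps `kwargs.update(cls.special_tokens_map)`, while the Nougat and Qwen2 hunks deep-copy the map and then layer the caller's kwargs on top. A small sketch of the behavioural difference; the token values below are placeholders, not the fixtures used by any of these tests.

import copy

special_tokens_map = {"unk_token": "[UNK]"}  # stands in for cls.special_tokens_map

# Style kept in the RoBERTa/BART/MVP getters: the class defaults are written
# into the caller's kwargs, so they win on key collisions.
kwargs = {"unk_token": "[CUSTOM]", "add_prefix_space": True}
kwargs.update(special_tokens_map)
assert kwargs["unk_token"] == "[UNK]"

# Style used in the Nougat/Qwen2 getters: copy the defaults, then apply the
# caller's kwargs, so explicit arguments win and the shared class attribute
# is never mutated by an individual test.
kwargs = {"unk_token": "[CUSTOM]", "add_prefix_space": True}
_kwargs = copy.deepcopy(special_tokens_map)
_kwargs.update(kwargs)
assert _kwargs["unk_token"] == "[CUSTOM]"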
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) @@ -202,7 +212,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_change_add_prefix_space_and_trim_offsets_args(self): for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2): - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets ) @@ -222,7 +232,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name` text = f"{text_of_1_token} {text_of_1_token}" - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -232,7 +242,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -242,7 +252,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)), ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -252,7 +262,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)), ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -274,7 +284,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), # ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -284,7 +294,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) @@ -294,7 +304,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)), ) - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = 
self.get_rust_tokenizer( pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False ) encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False) diff --git a/tests/models/roc_bert/test_tokenization_roc_bert.py b/tests/models/roc_bert/test_tokenization_roc_bert.py index fdd95a033aa..885975b8df3 100644 --- a/tests/models/roc_bert/test_tokenization_roc_bert.py +++ b/tests/models/roc_bert/test_tokenization_roc_bert.py @@ -41,8 +41,9 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): space_between_special_tokens = True from_pretrained_filter = filter_non_english - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "你", "好", "是", "谁", "a", "b", "c", "d"] word_shape = {} @@ -50,14 +51,14 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for i, value in enumerate(vocab_tokens): word_shape[value] = i word_pronunciation[value] = i - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.word_shape_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["word_shape_file"]) - self.word_pronunciation_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["word_pronunciation_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.word_shape_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["word_shape_file"]) + cls.word_pronunciation_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["word_pronunciation_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - with open(self.word_shape_file, "w", encoding="utf-8") as word_shape_writer: + with open(cls.word_shape_file, "w", encoding="utf-8") as word_shape_writer: json.dump(word_shape, word_shape_writer, ensure_ascii=False) - with open(self.word_pronunciation_file, "w", encoding="utf-8") as word_pronunciation_writer: + with open(cls.word_pronunciation_file, "w", encoding="utf-8") as word_pronunciation_writer: json.dump(word_pronunciation, word_pronunciation_writer, ensure_ascii=False) def test_full_tokenizer(self): @@ -204,7 +205,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_offsets_with_special_characters(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
tokens = tokenizer_r.encode_plus( @@ -260,8 +261,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): kwargs["tokenize_chinese_chars"] = True - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) @@ -274,8 +275,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char) kwargs["tokenize_chinese_chars"] = False - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False) ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False) diff --git a/tests/models/roformer/test_tokenization_roformer.py b/tests/models/roformer/test_tokenization_roformer.py index 6dfd0a385f0..e6db4f0d09b 100644 --- a/tests/models/roformer/test_tokenization_roformer.py +++ b/tests/models/roformer/test_tokenization_roformer.py @@ -15,11 +15,12 @@ import tempfile import unittest +from functools import lru_cache from transformers import RoFormerTokenizer, RoFormerTokenizerFast from transformers.testing_utils import require_rjieba, require_tokenizers -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_rjieba @@ -31,14 +32,25 @@ class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): space_between_special_tokens = True test_rust_tokenizer = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() + tokenizer = cls.tokenizer_class.from_pretrained("junnyu/roformer_chinese_base") + tokenizer.save_pretrained(cls.tmpdirname) - def get_tokenizer(self, **kwargs): - return self.tokenizer_class.from_pretrained("junnyu/roformer_chinese_base", **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs): - return self.rust_tokenizer_class.from_pretrained("junnyu/roformer_chinese_base", **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + pretrained_name = pretrained_name or cls.tmpdirname + return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) def get_chinese_input_output_texts(self): input_text = "永和服装饰品有限公司,今天天气非常好" diff --git a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py index efb8d87cac1..f55be02e172 100644 --- 
a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py @@ -59,12 +59,13 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece = True from_pretrained_kwargs = {} - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = SeamlessM4TTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_full_tokenizer(self): tokenizer = SeamlessM4TTokenizer(SAMPLE_VOCAB, keep_accents=True) @@ -353,7 +354,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) r_output = tokenizer_r.encode("Hey this is a token") @@ -363,7 +364,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_token_id in r_output) if self.test_slow_tokenizer: - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + tokenizer_cr = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs, # , from_slow=True <- unfortunately too slow to convert diff --git a/tests/models/siglip/test_tokenization_siglip.py b/tests/models/siglip/test_tokenization_siglip.py index b44451f6f20..f4bc56c5e33 100644 --- a/tests/models/siglip/test_tokenization_siglip.py +++ b/tests/models/siglip/test_tokenization_siglip.py @@ -17,12 +17,13 @@ import json import os import tempfile import unittest +from functools import lru_cache from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, SiglipTokenizer from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow from transformers.utils import cached_property, is_tf_available, is_torch_available -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @@ -44,13 +45,13 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece = True test_sentencepiece_ignore_case = True - # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.setUp with T5->Siglip - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = SiglipTokenizer(SAMPLE_VOCAB) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_convert_token_and_id with T5->Siglip def test_convert_token_and_id(self): @@ -135,9 +136,12 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def siglip_tokenizer(self): return SiglipTokenizer.from_pretrained("google/siglip-base-patch16-224") - # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.get_tokenizer with T5->Siglip - def get_tokenizer(self, **kwargs) -> SiglipTokenizer: - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, 
pretrained_name=None, **kwargs) -> SiglipTokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_rust_and_python_full_tokenizers with T5->Siglip def test_rust_and_python_full_tokenizers(self): @@ -227,10 +231,10 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [f"" for i in range(100)] + [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + tokenizer_cr = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True ) tokenizer_p = self.tokenizer_class.from_pretrained( diff --git a/tests/models/speech_to_text/test_tokenization_speech_to_text.py b/tests/models/speech_to_text/test_tokenization_speech_to_text.py index 6bea58ddfcf..3fc2926b62e 100644 --- a/tests/models/speech_to_text/test_tokenization_speech_to_text.py +++ b/tests/models/speech_to_text/test_tokenization_speech_to_text.py @@ -42,8 +42,9 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = False test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() spm_model = sp.SentencePieceProcessor() spm_model.Load(SAMPLE_VOCAB) @@ -52,13 +53,13 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase): vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))] vocab_tokens = dict(zip(vocab, range(len(vocab)))) - save_dir = Path(self.tmpdirname) + save_dir = Path(cls.tmpdirname) save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"]) if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists(): copyfile(SAMPLE_VOCAB, save_dir / VOCAB_FILES_NAMES["spm_file"]) - tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer = Speech2TextTokenizer.from_pretrained(cls.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" diff --git a/tests/models/speecht5/test_tokenization_speecht5.py b/tests/models/speecht5/test_tokenization_speecht5.py index 8b53031f524..026fd1d2f48 100644 --- a/tests/models/speecht5/test_tokenization_speecht5.py +++ b/tests/models/speecht5/test_tokenization_speecht5.py @@ -35,8 +35,9 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = False test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = SpeechT5Tokenizer(SAMPLE_VOCAB) @@ -46,7 +47,7 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.add_special_tokens({"mask_token": mask_token}) tokenizer.add_tokens([""]) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def get_input_output_texts(self, tokenizer): input_text = "this is a test" diff --git a/tests/models/splinter/test_tokenization_splinter.py b/tests/models/splinter/test_tokenization_splinter.py index 4c6d295e8a8..f654e2e5b73 100644 --- 
a/tests/models/splinter/test_tokenization_splinter.py
+++ b/tests/models/splinter/test_tokenization_splinter.py
@@ -13,8 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
+from functools import lru_cache

-from tests.test_tokenization_common import TokenizerTesterMixin
+from tests.test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
 from transformers import SplinterTokenizerFast, is_tf_available, is_torch_available
 from transformers.models.splinter import SplinterTokenizer
 from transformers.testing_utils import get_tests_dir, slow
@@ -40,20 +41,29 @@ class SplinterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     pre_trained_model_path = "tau/splinter-base"

     # Copied from transformers.models.siglip.SiglipTokenizationTest.setUp
-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
         tokenizer = SplinterTokenizer(SAMPLE_VOCAB)
         tokenizer.vocab["[UNK]"] = len(tokenizer.vocab)
         tokenizer.vocab["[QUESTION]"] = len(tokenizer.vocab)
         tokenizer.vocab["."] = len(tokenizer.vocab)
         tokenizer.add_tokens("this is a test thou shall not determine rigor truly".split())
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_tokenizer(self, **kwargs) -> SplinterTokenizer:
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> SplinterTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs) -> SplinterTokenizerFast:
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> SplinterTokenizerFast:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     # Copied from transformers.models.siglip.SiglipTokenizationTest.test_get_vocab
     def test_get_vocab(self):
diff --git a/tests/models/squeezebert/test_tokenization_squeezebert.py b/tests/models/squeezebert/test_tokenization_squeezebert.py
index 3ac24e8374b..0a75e768ccd 100644
--- a/tests/models/squeezebert/test_tokenization_squeezebert.py
+++ b/tests/models/squeezebert/test_tokenization_squeezebert.py
@@ -13,22 +13,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from functools import lru_cache

 from transformers import SqueezeBertTokenizer, SqueezeBertTokenizerFast
 from transformers.testing_utils import require_tokenizers, slow

-from ..bert.test_tokenization_bert import BertTokenizationTest
+from ...test_tokenization_common import use_cache_if_possible
+
+# Avoid import `BertTokenizationTest` directly as it will run as `test_tokenization_squeezebert.py::BertTokenizationTest`
+# together with `test_tokenization_bert.py::BertTokenizationTest`.
+from ..bert import test_tokenization_bert @require_tokenizers -class SqueezeBertTokenizationTest(BertTokenizationTest): +class SqueezeBertTokenizationTest(test_tokenization_bert.BertTokenizationTest): tokenizer_class = SqueezeBertTokenizer rust_tokenizer_class = SqueezeBertTokenizerFast test_rust_tokenizer = True from_pretrained_id = "squeezebert/squeezebert-uncased" - def get_rust_tokenizer(self, **kwargs): - return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs): + pretrained_name = pretrained_name or cls.tmpdirname + return SqueezeBertTokenizerFast.from_pretrained(pretrained_name, **kwargs) @slow def test_sequence_builders(self): diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index a7ad5320af2..aba5dde8cb9 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -17,12 +17,13 @@ import os import re import tempfile import unittest +from functools import lru_cache from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_seqio, require_tokenizers, slow from transformers.utils import cached_property, is_tf_available, is_torch_available -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @@ -44,12 +45,13 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = T5Tokenizer(SAMPLE_VOCAB) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" @@ -145,11 +147,19 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): def t5_base_tokenizer_fast(self): return T5TokenizerFast.from_pretrained("google-t5/t5-base") - def get_tokenizer(self, **kwargs) -> T5Tokenizer: - return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs) -> T5Tokenizer: + pretrained_name = pretrained_name or cls.tmpdirname + return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs) - def get_rust_tokenizer(self, **kwargs) -> T5TokenizerFast: - return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> T5TokenizerFast: + pretrained_name = pretrained_name or cls.tmpdirname + return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: @@ -275,10 +285,10 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [f"" for i in range(100)] + [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( 
pretrained_name, additional_special_tokens=added_tokens, **kwargs ) - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + tokenizer_cr = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True ) tokenizer_p = self.tokenizer_class.from_pretrained( @@ -460,10 +470,8 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): EXPECTED_WITH_SPACE = [9459, 149, 33, 25, 692, 1] EXPECTED_WO_SPACE = [3845, 63, 149, 33, 25, 692, 1] - slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False) - fast_ = self.rust_tokenizer_class.from_pretrained( - pretrained_name, add_prefix_space=False, legacy=False, from_slow=True - ) + slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=False, legacy=False) + fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=False, legacy=False, from_slow=True) self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE) self.assertEqual(slow_.encode(inputs), fast_.encode(inputs)) self.assertEqual(slow_.tokenize(inputs), ["He", "y", "▁how", "▁are", "▁you", "▁doing"]) @@ -473,8 +481,8 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True), ) - slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False) - fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False) + slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=True, legacy=False) + fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=True, legacy=False) self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE) self.assertEqual(slow_.encode(inputs), fast_.encode(inputs)) self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"]) diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index 20d56b4ed4c..f50e9eb8678 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -112,8 +112,9 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): return output_txt, output_ids - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab_tokens = [ "[UNK]", @@ -132,8 +133,8 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): "low", "lowest", ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): @@ -352,7 +353,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_offsets_with_special_characters(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
tokens = tokenizer_r.encode_plus( diff --git a/tests/models/udop/test_tokenization_udop.py b/tests/models/udop/test_tokenization_udop.py index 470a126b892..c8490c4cc0e 100644 --- a/tests/models/udop/test_tokenization_udop.py +++ b/tests/models/udop/test_tokenization_udop.py @@ -93,12 +93,13 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): return questions, words, boxes - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = UdopTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def get_input_output_texts(self, tokenizer): input_text = "UNwant\u00e9d,running" @@ -456,8 +457,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_padding(self, max_length=50): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id @@ -922,8 +923,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) # Input tokens id words, boxes = self.get_words_and_boxes() @@ -1109,7 +1110,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_offsets_mapping(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) text = ["a", "wonderful", "test"] boxes = [[1, 8, 12, 20] for _ in range(len(text))] @@ -1239,8 +1240,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) words, boxes = self.get_words_and_boxes() @@ -1293,8 +1294,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + 
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) words, boxes = self.get_words_and_boxes() tokens_r = tokenizer_r.encode_plus_boxes( words, @@ -1320,7 +1321,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_compare_add_special_tokens(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) @@ -1402,7 +1403,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("", lstrip=True)] - tokenizer_r = self.rust_tokenizer_class.from_pretrained( + tokenizer_r = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) words = "Hey this is a token".split() @@ -1416,7 +1417,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(special_token_id in r_output) if self.test_slow_tokenizer: - tokenizer_cr = self.rust_tokenizer_class.from_pretrained( + tokenizer_cr = self.get_rust_tokenizer( pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True ) tokenizer_p = self.tokenizer_class.from_pretrained( @@ -1591,8 +1592,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id) pad_token_id = tokenizer_p.pad_token_id diff --git a/tests/models/vits/test_tokenization_vits.py b/tests/models/vits/test_tokenization_vits.py index f4a9c8a7438..98b02ca5fd8 100644 --- a/tests/models/vits/test_tokenization_vits.py +++ b/tests/models/vits/test_tokenization_vits.py @@ -19,12 +19,13 @@ import os import shutil import tempfile import unittest +from functools import lru_cache from transformers import VitsTokenizer from transformers.models.vits.tokenization_vits import VOCAB_FILES_NAMES from transformers.testing_utils import slow -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase): @@ -32,8 +33,9 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = VitsTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab = ( "k ' z y u d h e s w – 3 c p - 1 j m i X f l o 0 b r a 4 2 n _ x v t q 5 6 g ț ţ < > | ".split( @@ -44,18 +46,22 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase): vocab_tokens[" "] = vocab_tokens["X"] del vocab_tokens["X"] - self.special_tokens_map = {"pad_token": "", "unk_token": ""} + cls.special_tokens_map = {"pad_token": "", "unk_token": ""} - self.tmpdirname = tempfile.mkdtemp() - self.vocab_file = os.path.join(self.tmpdirname, 
VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.tmpdirname = tempfile.mkdtemp() + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) kwargs["phonemize"] = False kwargs["normalize"] = False - return VitsTokenizer.from_pretrained(self.tmpdirname, **kwargs) + pretrained_name = pretrained_name or cls.tmpdirname + return VitsTokenizer.from_pretrained(pretrained_name, **kwargs) def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5): txt = "beyonce lives in los angeles" diff --git a/tests/models/wav2vec2/test_tokenization_wav2vec2.py b/tests/models/wav2vec2/test_tokenization_wav2vec2.py index 4a4058891d3..5a3906811fe 100644 --- a/tests/models/wav2vec2/test_tokenization_wav2vec2.py +++ b/tests/models/wav2vec2/test_tokenization_wav2vec2.py @@ -21,6 +21,7 @@ import random import shutil import tempfile import unittest +from functools import lru_cache import numpy as np @@ -33,7 +34,7 @@ from transformers import ( from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizerOutput from transformers.testing_utils import require_torch, slow -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible global_rng = random.Random() @@ -57,22 +58,27 @@ def floats_list(shape, scale=1.0, rng=None, name=None): class Wav2Vec2TokenizerTest(unittest.TestCase): tokenizer_class = Wav2Vec2Tokenizer - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") vocab_tokens = dict(zip(vocab, range(len(vocab)))) - self.special_tokens_map = {"pad_token": "", "unk_token": "", "bos_token": "", "eos_token": ""} + cls.special_tokens_map = {"pad_token": "", "unk_token": "", "bos_token": "", "eos_token": ""} - self.tmpdirname = tempfile.mkdtemp() - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.tmpdirname = tempfile.mkdtemp() + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return Wav2Vec2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return Wav2Vec2Tokenizer.from_pretrained(pretrained_name, **kwargs) def test_tokenizer_decode(self): # TODO(PVP) - change to facebook @@ -237,7 +243,7 @@ class Wav2Vec2TokenizerTest(unittest.TestCase): def test_save_pretrained(self): pretrained_name = list(self.tokenizer_class.pretrained_vocab_files_map["vocab_file"].keys())[0] - tokenizer = self.tokenizer_class.from_pretrained(pretrained_name) + tokenizer = self.get_tokenizer(pretrained_name) tmpdirname2 = tempfile.mkdtemp() 
tokenizer_files = tokenizer.save_pretrained(tmpdirname2) @@ -373,22 +379,27 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = Wav2Vec2CTCTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab = " | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") vocab_tokens = dict(zip(vocab, range(len(vocab)))) - self.special_tokens_map = {"pad_token": "", "unk_token": "", "bos_token": "", "eos_token": ""} + cls.special_tokens_map = {"pad_token": "", "unk_token": "", "bos_token": "", "eos_token": ""} - self.tmpdirname = tempfile.mkdtemp() - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.tmpdirname = tempfile.mkdtemp() + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: fp.write(json.dumps(vocab_tokens) + "\n") - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return Wav2Vec2CTCTokenizer.from_pretrained(pretrained_name, **kwargs) def test_tokenizer_add_token_chars(self): tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") diff --git a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py index 96bed25ad16..f9d547acddf 100644 --- a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py +++ b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py @@ -17,6 +17,7 @@ import json import os import unittest +from functools import lru_cache from typing import Tuple from transformers import Wav2Vec2PhonemeCTCTokenizer @@ -24,7 +25,7 @@ from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES from transformers.models.wav2vec2_phoneme.tokenization_wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizerOutput from transformers.testing_utils import require_phonemizer -from ...test_tokenization_common import TokenizerTesterMixin +from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible @require_phonemizer @@ -33,8 +34,9 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = Wav2Vec2PhonemeCTCTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() vocab = ( " n s t ə l a i k d m ɛ ɾ e ɪ p o ɐ z ð f j v b ɹ ʁ ʊ iː r w ʌ u ɡ æ aɪ ʃ h ɔ ɑː " @@ -53,10 +55,10 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): ).split(" ") vocab_tokens = dict(zip(vocab, range(len(vocab)))) - self.special_tokens_map = {"pad_token": "", "unk_token": "", "bos_token": "", "eos_token": ""} + cls.special_tokens_map = {"pad_token": "", "unk_token": "", "bos_token": "", "eos_token": ""} - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(cls.vocab_file, "w", encoding="utf-8") as fp: 
fp.write(json.dumps(vocab_tokens) + "\n") # overwrite since phonemes require specific creation @@ -84,9 +86,13 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): output_ids = tokenizer.encode(output_txt, add_special_tokens=False) return output_txt, output_ids - def get_tokenizer(self, **kwargs): - kwargs.update(self.special_tokens_map) - return Wav2Vec2PhonemeCTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) + @classmethod + @use_cache_if_possible + @lru_cache(maxsize=64) + def get_tokenizer(cls, pretrained_name=None, **kwargs): + kwargs.update(cls.special_tokens_map) + pretrained_name = pretrained_name or cls.tmpdirname + return Wav2Vec2PhonemeCTCTokenizer.from_pretrained(pretrained_name, **kwargs) def test_tokenizer_add_new_tokens(self): tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft") diff --git a/tests/models/whisper/test_tokenization_whisper.py b/tests/models/whisper/test_tokenization_whisper.py index 27b24448d5a..61a34c165d8 100644 --- a/tests/models/whisper/test_tokenization_whisper.py +++ b/tests/models/whisper/test_tokenization_whisper.py @@ -40,12 +40,13 @@ class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase): test_sentencepiece = False test_seq2seq = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny") tokenizer.pad_token_id = 50256 tokenizer.pad_token = "<|endoftext|>" - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" diff --git a/tests/models/xglm/test_tokenization_xglm.py b/tests/models/xglm/test_tokenization_xglm.py index eac3eda05da..08fa3ebf1a3 100644 --- a/tests/models/xglm/test_tokenization_xglm.py +++ b/tests/models/xglm/test_tokenization_xglm.py @@ -37,12 +37,13 @@ class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = XGLMTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" diff --git a/tests/models/xlm/test_tokenization_xlm.py b/tests/models/xlm/test_tokenization_xlm.py index 6bc7fedad48..2292b18b8bd 100644 --- a/tests/models/xlm/test_tokenization_xlm.py +++ b/tests/models/xlm/test_tokenization_xlm.py @@ -29,8 +29,9 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XLMTokenizer test_rust_tokenizer = False - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt vocab = [ @@ -59,11 +60,11 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_tokens = dict(zip(vocab, range(len(vocab)))) merges = ["l o 123", "lo w 1456", "e r 1789", ""] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) - with open(self.vocab_file, "w") as fp: + cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"]) + with open(cls.vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens)) - with open(self.merges_file, "w") as fp: + with open(cls.merges_file, "w") as fp: fp.write("\n".join(merges)) def get_input_output_texts(self, tokenizer): diff --git a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py index ae32a62a939..ba5a834bfe0 100644 --- a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py +++ b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py @@ -37,12 +37,13 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" @@ -148,8 +149,8 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-xlm-roberta", {}) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) tmpdirname2 = tempfile.mkdtemp() diff --git a/tests/models/xlnet/test_tokenization_xlnet.py b/tests/models/xlnet/test_tokenization_xlnet.py index 32dd4685c8c..307499b605d 100644 --- a/tests/models/xlnet/test_tokenization_xlnet.py +++ b/tests/models/xlnet/test_tokenization_xlnet.py @@ -33,12 +33,13 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): test_rust_tokenizer = True test_sentencepiece = True - def setUp(self): - super().setUp() + @classmethod + def setUpClass(cls): + super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) + tokenizer.save_pretrained(cls.tmpdirname) def test_convert_token_and_id(self): """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index eba94a45c92..6bc870614db 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy
+import functools
 import inspect
 import itertools
 import json
@@ -24,6 +26,7 @@ import tempfile
 import traceback
 import unittest
 from collections import OrderedDict
+from functools import lru_cache
 from itertools import takewhile
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
@@ -69,6 +72,38 @@ if TYPE_CHECKING:
     from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel


+def use_cache_if_possible(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        use_cache = kwargs.pop("use_cache", True)
+
+        underline_func = func
+        if "functools" in str(func):
+            underline_func = func.__wrapped__
+
+        if not use_cache:
+            return underline_func(*args, **kwargs)
+        if any(not arg.__hash__ for arg in args):
+            return underline_func(*args, **kwargs)
+        elif any(not kwarg.__hash__ for kwarg in kwargs.values()):
+            return underline_func(*args, **kwargs)
+
+        cached = func(*args, **kwargs)
+        copied = copy.deepcopy(cached)
+
+        if hasattr(copied, "_tokenizer") and "tests.models.clip.test_tokenization_clip.CLIPTokenizationTest" in str(
+            args[0]
+        ):
+            copied._tokenizer = cached._tokenizer
+
+        if hasattr(copied, "sp_model"):
+            copied.sp_model = cached.sp_model
+
+        return copied
+
+    return wrapper
+
+
 logger = logging.get_logger(__name__)

 NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
@@ -198,32 +233,34 @@ class TokenizerTesterMixin:
     # test_sentencepiece must also be set to True
     test_sentencepiece_ignore_case = False

-    def setUp(self) -> None:
+    @classmethod
+    def setUpClass(cls) -> None:
         # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
         # information available in Tokenizer (name, rust class, python class, vocab key name)
-        self.from_pretrained_id = (
-            [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id
+        cls.from_pretrained_id = (
+            [cls.from_pretrained_id] if isinstance(cls.from_pretrained_id, str) else cls.from_pretrained_id
         )

-        self.tokenizers_list = []
-        if self.test_rust_tokenizer:
-            self.tokenizers_list = [
+        cls.tokenizers_list = []
+        if cls.test_rust_tokenizer:
+            cls.tokenizers_list = [
                 (
-                    self.rust_tokenizer_class,
+                    cls.rust_tokenizer_class,
                     pretrained_id,
-                    self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
+                    cls.from_pretrained_kwargs if cls.from_pretrained_kwargs is not None else {},
                 )
-                for pretrained_id in self.from_pretrained_id
+                for pretrained_id in cls.from_pretrained_id
             ]
         else:
-            self.tokenizers_list = []
+            cls.tokenizers_list = []
         with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
-            self._data = f_data.read().replace("\n\n", "\n").strip()
+            cls._data = f_data.read().replace("\n\n", "\n").strip()

-        self.tmpdirname = tempfile.mkdtemp()
+        cls.tmpdirname = tempfile.mkdtemp()

-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.tmpdirname)

     def get_input_output_texts(self, tokenizer):
         input_txt = self.get_clean_sequence(tokenizer)[0]
@@ -267,11 +304,19 @@ class TokenizerTesterMixin:
         else:
             raise ValueError("This tokenizer class has no tokenizer to be tested.")

-    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizerFast:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def tokenizer_integration_test_util(
         self,
@@ -1263,7 +1308,7 @@
         if not self.test_rust_tokenizer:
             self.skipTest(reason="No fast tokenizer defined")

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name)
+        tokenizer_r = self.get_rust_tokenizer(pretrained_name)
         self._check_no_pad_token_padding(tokenizer_r, conversations)

         tokenizer_r.padding_side = "right"
@@ -1446,7 +1491,7 @@
         if not self.test_rust_tokenizer:
             self.skipTest(reason="No fast tokenizer defined")

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name)
+        tokenizer_r = self.get_rust_tokenizer(pretrained_name)

         # Find where to truncate, as the amount of tokens is different for different tokenizers and I want the
         # truncation to happen in the middle of the assistant content.
@@ -2050,11 +2095,9 @@
         if self.rust_tokenizer_class is not None:
             pretrained_name = self.from_pretrained_id

-            slow_tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, legacy=False)
+            slow_tokenizer = self.get_tokenizer(pretrained_name, legacy=False)
             with self.subTest(f"{pretrained_name}"):
-                rust_tokenizer = self.rust_tokenizer_class.from_pretrained(
-                    pretrained_name, from_slow=True, legacy=False
-                )
+                rust_tokenizer = self.get_rust_tokenizer(pretrained_name, from_slow=True, legacy=False)
                 input_full_vocab_ids = list(
                     range(len(slow_tokenizer))
                 )  # TODO let's maybe shuffle this! And run it 4 times.
This way we cover more cmbinations @@ -2200,14 +2243,10 @@ class TokenizerTesterMixin: for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): if self.test_rust_tokenizer: - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, padding_side="left", **kwargs - ) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, padding_side="left", **kwargs) self.assertEqual(tokenizer_r.padding_side, "left") - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, padding_side="right", **kwargs - ) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, padding_side="right", **kwargs) self.assertEqual(tokenizer_r.padding_side, "right") self.assertRaises( @@ -2219,10 +2258,10 @@ class TokenizerTesterMixin: ) if self.test_slow_tokenizer: - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="left", **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, padding_side="left", **kwargs) self.assertEqual(tokenizer_p.padding_side, "left") - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="right", **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, padding_side="right", **kwargs) self.assertEqual(tokenizer_p.padding_side, "right") self.assertRaises( @@ -2237,14 +2276,10 @@ class TokenizerTesterMixin: for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): if self.test_rust_tokenizer: - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, truncation_side="left", **kwargs - ) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, truncation_side="left", **kwargs) self.assertEqual(tokenizer_r.truncation_side, "left") - tokenizer_r = self.rust_tokenizer_class.from_pretrained( - pretrained_name, truncation_side="right", **kwargs - ) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, truncation_side="right", **kwargs) self.assertEqual(tokenizer_r.truncation_side, "right") self.assertRaises( @@ -2256,14 +2291,10 @@ class TokenizerTesterMixin: ) if self.test_slow_tokenizer: - tokenizer_p = self.tokenizer_class.from_pretrained( - pretrained_name, truncation_side="left", **kwargs - ) + tokenizer_p = self.get_tokenizer(pretrained_name, truncation_side="left", **kwargs) self.assertEqual(tokenizer_p.truncation_side, "left") - tokenizer_p = self.tokenizer_class.from_pretrained( - pretrained_name, truncation_side="right", **kwargs - ) + tokenizer_p = self.get_tokenizer(pretrained_name, truncation_side="right", **kwargs) self.assertEqual(tokenizer_p.truncation_side, "right") self.assertRaises( @@ -3194,18 +3225,18 @@ class TokenizerTesterMixin: def test_is_fast(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) # Check is_fast is set correctly self.assertTrue(tokenizer_r.is_fast) if self.test_slow_tokenizer: - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) self.assertFalse(tokenizer_p.is_fast) def test_fast_only_inputs(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = 
self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) # Ensure None raise an error self.assertRaises(TypeError, tokenizer_r.tokenize, None) @@ -3216,7 +3247,7 @@ class TokenizerTesterMixin: def test_alignement_methods(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] text = " ".join(words) @@ -3446,8 +3477,8 @@ class TokenizerTesterMixin: for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) # Ensure basic input match input_p = tokenizer_p.encode_plus(self._data) @@ -3487,8 +3518,8 @@ class TokenizerTesterMixin: for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) # Check we have the same number of added_tokens for both pair and non-pair inputs. self.assertEqual( @@ -3505,8 +3536,8 @@ class TokenizerTesterMixin: for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) # Check we have the correct max_length for both pair and non-pair inputs. self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) @@ -3520,8 +3551,8 @@ class TokenizerTesterMixin: for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): # sometimes the tokenizer saved online is not the same - tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs) + tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs) # Assert the set of special tokens match. 
                 self.assertSequenceEqual(
@@ -3532,7 +3563,7 @@ class TokenizerTesterMixin:
     def test_add_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
 
                 vocab_size = len(tokenizer_r)
                 self.assertEqual(tokenizer_r.add_tokens(""), 0)
@@ -3558,7 +3589,7 @@ class TokenizerTesterMixin:
     def test_offsets_mapping(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
 
                 text = "Wonderful no inspiration example with subtoken"
                 pair = "Along with an awesome pair"
@@ -3601,7 +3632,7 @@ class TokenizerTesterMixin:
         This needs to be padded so that it can represented as a tensor
         """
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+            tokenizer = self.get_rust_tokenizer(pretrained_name, **kwargs)
 
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"):
                 if is_torch_available():
@@ -3663,8 +3694,8 @@ class TokenizerTesterMixin:
 
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
 
                 if hasattr(tokenizer_p, "add_prefix_space") and not tokenizer_p.add_prefix_space:
                     continue  # Too hard to test for now
@@ -3745,8 +3776,8 @@ class TokenizerTesterMixin:
 
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
 
                 input_simple = [1, 2, 3]
                 input_pair = [1, 2, 3]
@@ -3767,8 +3798,8 @@ class TokenizerTesterMixin:
 
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 # # Input string
                 # input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False)
                 # input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False)
@@ -3812,8 +3843,8 @@ class TokenizerTesterMixin:
 
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
 
                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id
@@ -4038,8 +4069,8 @@ class TokenizerTesterMixin:
 
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
 
                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id
@@ -4076,8 +4107,8 @@ class TokenizerTesterMixin:
 
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
 
                 tmpdirname2 = tempfile.mkdtemp()
 
@@ -4151,8 +4182,8 @@ class TokenizerTesterMixin:
 
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
                 sentence = "A, AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(
                     sentence,
@@ -4176,7 +4207,7 @@ class TokenizerTesterMixin:
     def test_compare_add_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
 
                 simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
                 # pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True)
@@ -4219,8 +4250,8 @@ class TokenizerTesterMixin:
 
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 string_sequence = "Asserting that both tokenizers are equal"
                 python_output = tokenizer_p.prepare_for_model(
                     tokenizer_p.encode(string_sequence, add_special_tokens=False)
@@ -4235,7 +4266,7 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 added_tokens = [AddedToken("", lstrip=True)]
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=added_tokens, **kwargs
                 )
                 r_output = tokenizer_r.encode("Hey this is a token")
@@ -4246,12 +4277,10 @@ class TokenizerTesterMixin:
 
                 if self.test_slow_tokenizer:
                     # in rust fast, you lose the information of the AddedToken when initializing with `additional_special_tokens`
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_cr = self.get_rust_tokenizer(
                         pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
                     )
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                    )
+                    tokenizer_p = self.get_tokenizer(pretrained_name, additional_special_tokens=added_tokens, **kwargs)
 
                     p_output = tokenizer_p.encode("Hey this is a token")
 
@@ -4498,7 +4527,7 @@ class TokenizerTesterMixin:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 with tempfile.TemporaryDirectory() as tmp_dir:
                     # Save the fast tokenizer files in a temporary directory
-                    tokenizer_old = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs, use_fast=True)
+                    tokenizer_old = self.get_rust_tokenizer(pretrained_name, **kwargs, use_fast=True)
                     tokenizer_old.save_pretrained(tmp_dir, legacy_format=False)  # save only fast version
 
                     # Initialize toy model for the trainer
@@ -4532,13 +4561,11 @@ class TokenizerTesterMixin:
                 with tempfile.TemporaryDirectory() as tmp_dir_1:
                     # Here we check that even if we have initialized a fast tokenizer with a tokenizer_file we can
                     # still save only the slow version and use these saved files to rebuild a tokenizer
-                    tokenizer_fast_old_1 = self.rust_tokenizer_class.from_pretrained(
-                        pretrained_name, **kwargs, use_fast=True
-                    )
+                    tokenizer_fast_old_1 = self.get_rust_tokenizer(pretrained_name, **kwargs, use_fast=True)
                     tokenizer_file = os.path.join(tmp_dir_1, "tokenizer.json")
                     tokenizer_fast_old_1.backend_tokenizer.save(tokenizer_file)
 
-                    tokenizer_fast_old_2 = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_fast_old_2 = self.get_rust_tokenizer(
                         pretrained_name, **kwargs, use_fast=True, tokenizer_file=tokenizer_file
                     )
 
@@ -4560,10 +4587,10 @@ class TokenizerTesterMixin:
             special_token = ""
             special_sentence = f"Hey this is a {special_token} token"
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_rust = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_rust = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
                 )
-                tokenizer_py = self.tokenizer_class.from_pretrained(
+                tokenizer_py = self.get_tokenizer(
                     pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
                 )
 
@@ -4622,7 +4649,7 @@ class TokenizerTesterMixin:
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 # Load a slow tokenizer from the hub, init with the new token for fast to also include it
-                tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
+                tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos)
                 EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
                 with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
                     self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
@@ -4662,7 +4689,7 @@ class TokenizerTesterMixin:
 
             with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
                 if self.rust_tokenizer_class is not None:
-                    tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
+                    tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos)
                     self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
                     self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                     # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
diff --git a/tests/tokenization/test_tokenization_fast.py b/tests/tokenization/test_tokenization_fast.py
index 4bd9b046d40..40b945e272b 100644
--- a/tests/tokenization/test_tokenization_fast.py
+++ b/tests/tokenization/test_tokenization_fast.py
@@ -33,19 +33,20 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     from_pretrained_vocab_key = "tokenizer_file"
 
-    def setUp(self):
-        self.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
-        super().setUp()
-        self.test_rust_tokenizer = True
+    @classmethod
+    def setUpClass(cls):
+        cls.test_rust_tokenizer = False  # because we don't have pretrained_vocab_files_map
+        super().setUpClass()
+        cls.test_rust_tokenizer = True
 
         model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]
-        self.bytelevel_bpe_model_name = "SaulLu/dummy-tokenizer-bytelevel-bpe"
+        cls.bytelevel_bpe_model_name = "SaulLu/dummy-tokenizer-bytelevel-bpe"
 
         # Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
-        self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]
+        cls.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]
 
         tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)
 
     @unittest.skip(
         "We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any model"
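
For context on the call-site changes above: the `get_tokenizer` / `get_rust_tokenizer` helpers that replace direct `from_pretrained` calls are intended to act as cached accessors, so repeated requests for the same checkpoint within a test class reuse one loaded tokenizer instead of reloading its files each time. A minimal sketch of that pattern, assuming a hypothetical test class (the class name, the `AutoTokenizer` stand-in, the temporary path, and the cache size are illustrative assumptions, not the exact helpers defined in this patch):

from functools import lru_cache

from transformers import AutoTokenizer


class CachedTokenizerTestSketch:
    # Hypothetical stand-ins for the real test-class attributes.
    rust_tokenizer_class = AutoTokenizer
    tmpdirname = "/tmp/tokenizer-fixture"

    @classmethod
    @lru_cache(maxsize=64)
    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
        # Identical (pretrained_name, **kwargs) calls hit the cache instead of
        # re-reading tokenizer files from disk or the Hub.
        pretrained_name = pretrained_name or cls.tmpdirname
        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

Because `lru_cache` keys on the call arguments, this only helps when the keyword arguments are hashable and repeated verbatim across tests; a call site that needs a fresh instance can still construct one directly.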