Use lru_cache for tokenization tests (#36818)

* fix

* fix

* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Yih-Dar 2025-03-28 15:09:35 +01:00 committed by GitHub
parent 3af425d4c6
commit 1fcaad6df9
92 changed files with 1301 additions and 884 deletions
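
This commit applies one pattern across the tokenization test files below: the per-test setUp becomes a per-class setUpClass, and the get_tokenizer / get_rust_tokenizer helpers become classmethods wrapped in functools.lru_cache plus a use_cache_if_possible decorator imported from test_tokenization_common, so identical tokenizers are built once and reused instead of being reloaded on every call. A condensed sketch of the resulting shape (the class name is illustrative; the decorator stack, helper names, and body follow the hunks below):

# Condensed sketch of the pattern applied to each test file below.
# `ExampleTokenizationTest` is illustrative; decorators, helper names and
# decorator order are taken from the diff.
import unittest
from functools import lru_cache

from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


class ExampleTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Fixture vocab/merges files are written once per test class
        # (previously setUp rebuilt them before every single test).
        super().setUpClass()
        cls.special_tokens_map = {"unk_token": "<unk>"}
        # ... write vocab/merges files into cls.tmpdirname ...

    @classmethod
    @use_cache_if_possible
    @lru_cache(maxsize=64)
    def get_tokenizer(cls, pretrained_name=None, **kwargs):
        # Cached per (cls, pretrained_name, kwargs): repeated requests for the
        # same tokenizer reuse one instance instead of reloading it from disk
        # or the Hub.
        kwargs.update(cls.special_tokens_map)
        pretrained_name = pretrained_name or cls.tmpdirname
        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

    # get_rust_tokenizer follows the same shape with the fast tokenizer class.

Because cls is part of the lru_cache key, every test class keeps its own cached tokenizers, and maxsize=64 bounds how many distinct configurations stay alive at once.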


@ -34,12 +34,13 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True
test_sentencepiece_ignore_case = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "this is a test"


@ -14,13 +14,14 @@
import json
import os
import unittest
from functools import lru_cache
from transformers import BartTokenizer, BartTokenizerFast, BatchEncoding
from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, require_torch
from transformers.utils import cached_property
from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors
from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors, use_cache_if_possible
@require_tokenizers
@ -32,8 +33,10 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_filter = filter_roberta_detectors
# from_pretrained_kwargs = {'add_prefix_space': True}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = [
"l",
"o",
@ -58,22 +61,30 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
return "lower newer", "lower newer"
@ -154,8 +165,8 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
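
Note on use_cache_if_possible: the hunks only show it being imported from test_tokenization_common and stacked on top of lru_cache; its body is not part of this diff. As a hypothetical sketch of what such a wrapper has to handle (an assumption, not the repository's implementation): skip the cache when caching is explicitly turned off or the arguments are unhashable, and return a copy so a test that mutates its tokenizer cannot corrupt the cached instance.

# Hypothetical sketch only: the real `use_cache_if_possible` lives in
# tests/test_tokenization_common.py and may differ in detail.
import copy
import functools


def use_cache_if_possible(func):
    """Make an lru_cache-wrapped tokenizer factory a best-effort cache."""

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        use_cache = kwargs.pop("use_cache", True)  # assumed opt-out switch
        # lru_cache keys must be hashable; e.g. a list of AddedToken is not.
        hashable = all(getattr(a, "__hash__", None) is not None for a in args) and all(
            getattr(v, "__hash__", None) is not None for v in kwargs.values()
        )
        if not use_cache or not hashable:
            # Bypass the cache by calling the undecorated function directly.
            return getattr(func, "__wrapped__", func)(*args, **kwargs)
        # Hand back a deep copy so per-test mutations (added tokens, changed
        # pad_token, ...) never leak into other tests through the cache.
        return copy.deepcopy(func(*args, **kwargs))

    return wrapper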


@ -31,13 +31,14 @@ class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = BarthezTokenizerFast.from_pretrained("moussaKam/mbarthez")
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(self.tmpdirname, legacy_format=False)
self.tokenizer = tokenizer
tokenizer.save_pretrained(cls.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname, legacy_format=False)
cls.tokenizer = tokenizer
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""


@ -15,11 +15,12 @@
import os
import unittest
from functools import lru_cache
from transformers.models.bartpho.tokenization_bartpho import VOCAB_FILES_NAMES, BartphoTokenizer
from transformers.testing_utils import get_tests_dir
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model")
@ -31,24 +32,29 @@ class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = ["▁This", "▁is", "▁a", "▁t", "est"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.monolingual_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["monolingual_vocab_file"])
with open(self.monolingual_vocab_file, "w", encoding="utf-8") as fp:
cls.monolingual_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["monolingual_vocab_file"])
with open(cls.monolingual_vocab_file, "w", encoding="utf-8") as fp:
for token in vocab_tokens:
fp.write(f"{token} {vocab_tokens[token]}\n")
tokenizer = BartphoTokenizer(SAMPLE_VOCAB, self.monolingual_vocab_file, **self.special_tokens_map)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer = BartphoTokenizer(SAMPLE_VOCAB, cls.monolingual_vocab_file, **cls.special_tokens_map)
tokenizer.save_pretrained(cls.tmpdirname)
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return BartphoTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return BartphoTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "This is a là test"


@ -41,8 +41,9 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
space_between_special_tokens = True
from_pretrained_filter = filter_non_english
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"[UNK]",
@ -61,8 +62,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):
@ -257,7 +258,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus(
@ -312,8 +313,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
kwargs["tokenize_chinese_chars"] = True
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@ -326,8 +327,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)
kwargs["tokenize_chinese_chars"] = False
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
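
The call sites move in the same direction: wherever a test previously called self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) or the rust equivalent, it now goes through the cached get_tokenizer / get_rust_tokenizer helpers, so a loop over self.tokenizers_list no longer reloads the same checkpoint for every subtest. Roughly (illustrative test body; the test name and input text are made up):

# Illustrative: with the cached helpers, every test asking for the same
# (pretrained_name, kwargs) combination shares one tokenizer instance.
def test_slow_and_fast_agree(self):
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)  # loaded once, then cached
            tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)  # ditto for the fast one
            self.assertEqual(
                tokenizer_p.encode("lower newer", add_special_tokens=False),
                tokenizer_r.encode("lower newer", add_special_tokens=False),
            )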


@ -34,11 +34,12 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""


@ -17,6 +17,7 @@
import os
import pickle
import unittest
from functools import lru_cache
from transformers import AutoTokenizer
from transformers.models.bert.tokenization_bert import BertTokenizer
@ -31,7 +32,7 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import (
)
from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi_projection
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@custom_tokenizers
@ -41,8 +42,9 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False
space_between_special_tokens = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"[UNK]",
@ -72,8 +74,8 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"です",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):
@ -408,17 +410,21 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC
tokenizer_class = BertJapaneseTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "", "", "", "", "", "", "", "", "", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self, **kwargs):
return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
return BertJapaneseTokenizer.from_pretrained(cls.tmpdirname, subword_tokenizer_type="character", **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "こんにちは、世界。 \nこんばんは、世界。"


@ -15,10 +15,11 @@
import os
import unittest
from functools import lru_cache
from transformers.models.bertweet.tokenization_bertweet import VOCAB_FILES_NAMES, BertweetTokenizer
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@ -26,26 +27,31 @@ class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertweetTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["I", "m", "V@@", "R@@", "r", "e@@"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "a m</w>"]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
for token in vocab_tokens:
fp.write(f"{token} {vocab_tokens[token]}\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return BertweetTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return BertweetTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "I am VinAI Research"


@ -36,11 +36,12 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = self.tokenizer_class(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer = cls.tokenizer_class(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""


@ -30,8 +30,9 @@ class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BioGptTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@ -60,11 +61,11 @@ class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp:
with open(cls.merges_file, "w") as fp:
fp.write("\n".join(merges))
def get_input_output_texts(self, tokenizer):


@ -18,13 +18,14 @@
import json
import os
import unittest
from functools import lru_cache
from transformers.models.blenderbot_small.tokenization_blenderbot_small import (
VOCAB_FILES_NAMES,
BlenderbotSmallTokenizer,
)
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
@ -32,25 +33,30 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BlenderbotSmallTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "a p", "t e</w>", "ap t</w>", "a d", "ad apt</w>", "a c", "ac t</w>", ""]
self.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"}
cls.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return BlenderbotSmallTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return BlenderbotSmallTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "adapt act apte"


@ -13,14 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import unittest
from functools import lru_cache
from datasets import load_dataset
from transformers import BloomTokenizerFast
from transformers.testing_utils import require_jinja, require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -34,14 +36,21 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_vocab_key = "tokenizer_file"
special_tokens_map = {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/tokenizer")
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
_kwargs = copy.deepcopy(cls.special_tokens_map)
_kwargs.update(kwargs)
kwargs = _kwargs
pretrained_name = pretrained_name or cls.tmpdirname
return BloomTokenizerFast.from_pretrained(pretrained_name, **kwargs)
@unittest.skip(reason="This needs a slow tokenizer. Bloom does not have one!")
def test_encode_decode_with_spaces(self):
@ -65,7 +74,7 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=6):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# tokenizer_r.pad_token = None # Hotfixing padding = None
# Simple input
s = "This is a simple input"


@ -19,12 +19,13 @@ import re
import shutil
import tempfile
import unittest
from functools import lru_cache
from typing import Tuple
from transformers import AddedToken, BatchEncoding, ByT5Tokenizer
from transformers.utils import cached_property, is_tf_available, is_torch_available
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
if is_torch_available():
@ -39,17 +40,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = ByT5Tokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = ByT5Tokenizer()
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
@cached_property
def t5_base_tokenizer(self):
return ByT5Tokenizer.from_pretrained("google/byt5-small")
def get_tokenizer(self, **kwargs) -> ByT5Tokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> ByT5Tokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
# XXX The default common tokenizer tests assume that every ID is decodable on its own.


@ -15,6 +15,7 @@
import tempfile
import unittest
from tempfile import TemporaryDirectory
from transformers import AddedToken, CamembertTokenizer, CamembertTokenizerFast
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
@ -38,12 +39,13 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
@unittest.skip(
"Token maps are not equal because someone set the probability of ('<unk>NOTUSED', -100), so it's never encoded for fast"
@ -72,8 +74,9 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_rust_and_python_bpe_tokenizers(self):
tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname)
rust_tokenizer = CamembertTokenizerFast.from_pretrained(self.tmpdirname)
with TemporaryDirectory() as tmpdirname:
tokenizer.save_pretrained(tmpdirname)
rust_tokenizer = CamembertTokenizerFast.from_pretrained(tmpdirname)
sequence = "I was born in 92000, and this is falsé."
@ -147,11 +150,11 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
return tokenizer
new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False)
new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
# Load a slow tokenizer from the hub, init with the new token for fast to also include it
tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos)
EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
@ -191,9 +194,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
if self.rust_tokenizer_class is not None:
tokenizer_fast = self.rust_tokenizer_class.from_pretrained(
pretrained_name, eos_token=new_eos, from_slow=True
)
tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos, from_slow=True)
self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
# We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
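
One knock-on effect of the class-level fixtures shows up in the Camembert BPE test above: cls.tmpdirname is now shared by every test in the class (and by any tokenizer cached from it), so a test that saves a different tokenizer writes to its own TemporaryDirectory instead of overwriting the shared directory. The isolation pattern, in short (a fragment of the test class, names as in the diff):

from tempfile import TemporaryDirectory

def test_rust_and_python_bpe_tokenizers(self):
    # Save the BPE variant into a throwaway directory so the class-level
    # fixture in cls.tmpdirname stays intact for the other tests.
    tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
    with TemporaryDirectory() as tmpdirname:
        tokenizer.save_pretrained(tmpdirname)
        rust_tokenizer = CamembertTokenizerFast.from_pretrained(tmpdirname)
    # ... the slow and fast tokenizers are then compared on the same sequence ...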


@ -18,13 +18,14 @@ import os
import shutil
import tempfile
import unittest
from functools import lru_cache
from transformers import BatchEncoding, CanineTokenizer
from transformers.testing_utils import require_tokenizers, require_torch
from transformers.tokenization_utils import AddedToken
from transformers.utils import cached_property
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@ -32,17 +33,22 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = CanineTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = CanineTokenizer()
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
@cached_property
def canine_tokenizer(self):
return CanineTokenizer.from_pretrained("google/canine-s")
def get_tokenizer(self, **kwargs) -> CanineTokenizer:
tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> CanineTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
tokenizer = cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer._unicode_vocab_size = 1024
return tokenizer


@ -17,12 +17,13 @@
import json
import os
import unittest
from functools import lru_cache
from transformers import CLIPTokenizer, CLIPTokenizerFast
from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
from transformers.testing_utils import require_ftfy, require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -34,28 +35,37 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_kwargs = {}
test_seq2seq = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"] # fmt: skip
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return CLIPTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return CLIPTokenizerFast.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
@ -77,8 +87,8 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_check_encoding_slow_fast(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_s = self.get_tokenizer(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."
text_tokenized_s = tokenizer_s.tokenize(text)
@ -138,7 +148,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name`
text = f"{text_of_1_token} {text_of_1_token}"
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name,
use_fast=True,
)
@ -151,7 +161,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
text = f" {text}"
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name,
use_fast=True,
)
@ -166,7 +176,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Test related to the breaking change introduced in transformers v4.17.0
# We need to check that an error in raised when the user try to load a previous version of the tokenizer.
with self.assertRaises(ValueError) as context:
self.rust_tokenizer_class.from_pretrained("robot-test/old-clip-tokenizer")
self.get_rust_tokenizer("robot-test/old-clip-tokenizer")
self.assertTrue(
context.exception.args[0].startswith(


@ -17,11 +17,12 @@
import json
import os
import unittest
from functools import lru_cache
from typing import List
from transformers import ClvpTokenizer
from ...test_tokenization_common import TokenizerTesterMixin, slow
from ...test_tokenization_common import TokenizerTesterMixin, slow, use_cache_if_possible
class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@ -32,8 +33,9 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_seq2seq = False
test_sentencepiece_ignore_case = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@ -62,19 +64,23 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, "vocab.json")
self.merges_file = os.path.join(self.tmpdirname, "merges.txt")
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, "vocab.json")
cls.merges_file = os.path.join(cls.tmpdirname, "merges.txt")
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_tokenizer with GPT2->Clvp
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return ClvpTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return ClvpTokenizer.from_pretrained(pretrained_name, **kwargs)
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer):
@ -134,7 +140,7 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"


@ -53,15 +53,16 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True
from_pretrained_kwargs = {}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_tokenizers(self, **kwargs):
def get_tokenizers(cls, **kwargs):
kwargs.update({"pad_token": "<PAD>"})
return super().get_tokenizers(**kwargs)
@ -151,8 +152,8 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
]
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()
@ -255,7 +256,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -265,7 +266,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
tokenizer_cr = self.get_rust_tokenizer(
pretrained_name,
additional_special_tokens=added_tokens,
**kwargs, # , from_slow=True <- unfortunately too slow to convert


@ -18,12 +18,13 @@ import json
import os
import re
import unittest
from functools import lru_cache
from transformers import CodeGenTokenizer, CodeGenTokenizerFast
from transformers.models.codegen.tokenization_codegen import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -35,8 +36,9 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_kwargs = {"add_prefix_space": True}
test_seq2seq = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@ -64,22 +66,30 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return CodeGenTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return CodeGenTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return CodeGenTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return CodeGenTokenizerFast.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
@ -136,7 +146,7 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"


@ -13,12 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import unittest
from functools import lru_cache
from transformers import CohereTokenizerFast
from transformers.testing_utils import require_jinja, require_tokenizers, require_torch_multi_gpu
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -37,14 +39,21 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"pad_token": "<PAD>",
}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = CohereTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-CohereForCausalLM")
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return CohereTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
_kwargs = copy.deepcopy(cls.special_tokens_map)
_kwargs.update(kwargs)
kwargs = _kwargs
pretrained_name = pretrained_name or cls.tmpdirname
return CohereTokenizerFast.from_pretrained(pretrained_name, **kwargs)
# This gives CPU OOM on a single-gpu runner (~60G RAM). On multi-gpu runner, it has ~180G RAM which is enough.
@require_torch_multi_gpu
@ -80,7 +89,7 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=10):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# tokenizer_r.pad_token = None # Hotfixing padding = None
# Simple input
s = "This is a simple input"


@ -28,8 +28,9 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = CpmAntTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"<d>",
@ -49,8 +50,8 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"n",
"t",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
@tooslow


@ -16,10 +16,11 @@
import json
import os
import unittest
from functools import lru_cache
from transformers.models.ctrl.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@ -28,25 +29,30 @@ class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False
test_seq2seq = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "a p", "ap t</w>", "r e", "a d", "ad apt</w>", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return CTRLTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "adapt react readapt apt"


@ -17,12 +17,13 @@
import json
import os
import unittest
from functools import lru_cache
from transformers import DebertaTokenizer, DebertaTokenizerFast
from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES
from transformers.testing_utils import slow
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@ -31,8 +32,9 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
rust_tokenizer_class = DebertaTokenizerFast
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@ -59,18 +61,22 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "[UNK]"}
cls.special_tokens_map = {"unk_token": "[UNK]"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"


@ -33,12 +33,13 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True
test_sentencepiece_ignore_case = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>")
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "this is a test"


@ -17,11 +17,11 @@
from transformers import DistilBertTokenizer, DistilBertTokenizerFast
from transformers.testing_utils import require_tokenizers, slow
from ..bert.test_tokenization_bert import BertTokenizationTest
from ..bert import test_tokenization_bert
@require_tokenizers
class DistilBertTokenizationTest(BertTokenizationTest):
class DistilBertTokenizationTest(test_tokenization_bert.BertTokenizationTest):
tokenizer_class = DistilBertTokenizer
rust_tokenizer_class = DistilBertTokenizerFast
test_rust_tokenizer = True


@ -25,11 +25,11 @@ from transformers import (
from transformers.testing_utils import require_tokenizers, slow
from transformers.tokenization_utils_base import BatchEncoding
from ..bert.test_tokenization_bert import BertTokenizationTest
from ..bert import test_tokenization_bert
@require_tokenizers
class DPRContextEncoderTokenizationTest(BertTokenizationTest):
class DPRContextEncoderTokenizationTest(test_tokenization_bert.BertTokenizationTest):
tokenizer_class = DPRContextEncoderTokenizer
rust_tokenizer_class = DPRContextEncoderTokenizerFast
test_rust_tokenizer = True
@ -37,7 +37,7 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest):
@require_tokenizers
class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
class DPRQuestionEncoderTokenizationTest(test_tokenization_bert.BertTokenizationTest):
tokenizer_class = DPRQuestionEncoderTokenizer
rust_tokenizer_class = DPRQuestionEncoderTokenizerFast
test_rust_tokenizer = True
@ -45,7 +45,7 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
@require_tokenizers
class DPRReaderTokenizationTest(BertTokenizationTest):
class DPRReaderTokenizationTest(test_tokenization_bert.BertTokenizationTest):
tokenizer_class = DPRReaderTokenizer
rust_tokenizer_class = DPRReaderTokenizerFast
test_rust_tokenizer = True


@ -40,8 +40,9 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
space_between_special_tokens = True
from_pretrained_filter = filter_non_english
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"[UNK]",
@ -60,8 +61,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):
@ -250,7 +251,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus(
@ -305,8 +306,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
kwargs["tokenize_chinese_chars"] = True
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@ -319,8 +320,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)
kwargs["tokenize_chinese_chars"] = False
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)


@ -17,6 +17,7 @@
import os
import tempfile
import unittest
from functools import lru_cache
from typing import List
from transformers.models.esm.tokenization_esm import VOCAB_FILES_NAMES, EsmTokenizer
@ -24,24 +25,32 @@ from transformers.testing_utils import require_tokenizers
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from ...test_tokenization_common import use_cache_if_possible
@require_tokenizers
class ESMTokenizationTest(unittest.TestCase):
tokenizer_class = EsmTokenizer
def setUp(self):
super().setUp()
self.tmpdirname = tempfile.mkdtemp()
@classmethod
def setUpClass(cls):
super().setUpClass()
cls.tmpdirname = tempfile.mkdtemp()
vocab_tokens: List[str] = ["<cls>", "<pad>", "<eos>", "<unk>", "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z", "O", ".", "-", "<null_1>", "<mask>"] # fmt: skip
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
return [self.get_tokenizer(**kwargs)]
def get_tokenizers(cls, **kwargs) -> List[PreTrainedTokenizerBase]:
return [cls.get_tokenizer(**kwargs)]
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def test_tokenizer_single_example(self):
tokenizer = self.tokenizer_class(self.vocab_file)


@ -28,10 +28,11 @@ class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase)
tokenizer_class = FastSpeech2ConformerTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "this is a test"


@ -30,8 +30,9 @@ class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = FlaubertTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "i</w>", "lo", "low", "ne", "new", "er</w>", "low</w>", "lowest</w>", "new</w>", "newer</w>", "wider</w>", "<unk>"] # fmt: skip
@ -39,11 +40,11 @@ class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["n e 300", "ne w 301", "e r</w> 302", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
# Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_full_tokenizer


@ -36,12 +36,13 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece_ignore_case = True
test_seq2seq = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = FNetTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "this is a test"
@ -147,7 +148,7 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -175,7 +176,7 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
)
special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0]
@ -198,8 +199,8 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id

View File

@ -34,8 +34,9 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = FSMTTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@ -64,22 +65,22 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
self.langs = ["en", "ru"]
cls.langs = ["en", "ru"]
config = {
"langs": self.langs,
"langs": cls.langs,
"src_vocab_size": 10,
"tgt_vocab_size": 20,
}
self.src_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"])
self.tgt_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"])
config_file = os.path.join(self.tmpdirname, "tokenizer_config.json")
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.src_vocab_file, "w") as fp:
cls.src_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"])
cls.tgt_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"])
config_file = os.path.join(cls.tmpdirname, "tokenizer_config.json")
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.src_vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens))
with open(self.tgt_vocab_file, "w") as fp:
with open(cls.tgt_vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp:
with open(cls.merges_file, "w") as fp:
fp.write("\n".join(merges))
with open(config_file, "w") as fp:
fp.write(json.dumps(config))

View File

@ -16,12 +16,13 @@
import os
import unittest
from functools import lru_cache
from transformers import FunnelTokenizer, FunnelTokenizerFast
from transformers.models.funnel.tokenization_funnel import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -32,8 +33,9 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
space_between_special_tokens = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"<unk>",
@ -50,15 +52,23 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self, **kwargs):
return FunnelTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return FunnelTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
return FunnelTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return FunnelTokenizerFast.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00e9d,running"

View File

@ -53,12 +53,13 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True
from_pretrained_kwargs = {}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
@require_torch
def test_batch_tokenization(self):
@ -103,7 +104,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -113,7 +114,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
tokenizer_cr = self.get_rust_tokenizer(
pretrained_name,
additional_special_tokens=added_tokens,
**kwargs, # , from_slow=True <- unfortunately too slow to convert

View File

@ -17,12 +17,13 @@
import json
import os
import unittest
from functools import lru_cache
from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast
from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES
from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -34,8 +35,9 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_kwargs = {"add_prefix_space": True}
test_seq2seq = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@ -63,22 +65,30 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return GPT2Tokenizer.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return GPT2TokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return GPT2TokenizerFast.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
@ -135,7 +145,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"

View File

@ -17,6 +17,7 @@
import json
import os
import unittest
from functools import lru_cache
from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import (
VOCAB_FILES_NAMES,
@ -24,7 +25,7 @@ from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import
)
from transformers.testing_utils import require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -34,8 +35,9 @@ class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False
from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"こん",
@ -62,18 +64,22 @@ class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"<|endoftext|>",
]
emoji_tokens = {"emoji": {"\ud83d\ude00": "<|emoji1|>"}, "emoji_inv": {"<|emoji1|>": "\ud83d\ude00"}} # 😀
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.emoji_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["emoji_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.emoji_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["emoji_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
with open(self.emoji_file, "w") as emoji_writer:
with open(cls.emoji_file, "w") as emoji_writer:
emoji_writer.write(json.dumps(emoji_tokens))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return GPTNeoXJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return GPTNeoXJapaneseTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "こんにちは、世界。 \nこんばんは、㔺界。😀"

View File

@ -33,13 +33,14 @@ class GPTSw3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True
test_sentencepiece_ignore_case = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, eos_token="<unk>", bos_token="<unk>", pad_token="<unk>")
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "This is a test"

View File

@ -33,12 +33,13 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
rust_tokenizer_class = HerbertTokenizerFast
test_rust_tokenizer = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Use a simpler test file without japanese/chinese characters
with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip()
cls._data = f_data.read().replace("\n\n", "\n").strip()
vocab = [
"<s>",
@ -69,11 +70,11 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp:
with open(cls.merges_file, "w") as fp:
fp.write("\n".join(merges))
def get_input_output_texts(self, tokenizer):

View File

@ -16,12 +16,13 @@
import os
import unittest
from functools import lru_cache
from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast
from transformers.models.layoutlm.tokenization_layoutlm import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -32,8 +33,9 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
space_between_special_tokens = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"[UNK]",
@ -50,12 +52,16 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self, **kwargs):
return LayoutLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return LayoutLMTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00e9d,running"

View File

@ -102,8 +102,9 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return questions, words, boxes
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"[UNK]",
@ -122,8 +123,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"test",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):
@ -267,7 +268,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
words[1] = tokenizer_r.mask_token
@ -605,8 +606,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id
@ -1060,7 +1061,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Input tokens id
words, boxes = self.get_words_and_boxes()
@ -1363,7 +1364,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
@ -1417,7 +1418,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
tokens_r = tokenizer_r.encode_plus(
words,
@ -1715,7 +1716,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id

View File

@ -20,6 +20,7 @@ import re
import shutil
import tempfile
import unittest
from functools import lru_cache
from typing import List
from parameterized import parameterized
@ -41,7 +42,12 @@ from transformers.testing_utils import (
slow,
)
from ...test_tokenization_common import SMALL_TRAINING_CORPUS, TokenizerTesterMixin, merge_model_tokenizer_mappings
from ...test_tokenization_common import (
SMALL_TRAINING_CORPUS,
TokenizerTesterMixin,
merge_model_tokenizer_mappings,
use_cache_if_possible,
)
logger = logging.get_logger(__name__)
@ -91,8 +97,9 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return questions, words, boxes
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@ -119,22 +126,30 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return LayoutLMv3TokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return LayoutLMv3TokenizerFast.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
@ -485,8 +500,8 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id
@ -940,7 +955,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Input tokens id
words, boxes = self.get_words_and_boxes()
@ -1241,7 +1256,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
@ -1295,7 +1310,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
tokens_r = tokenizer_r.encode_plus(
words,
@ -1593,7 +1608,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id

View File

@ -96,12 +96,13 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return questions, words, boxes
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = LayoutXLMTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00e9d,running"
@ -157,7 +158,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
_, _, boxes = self.get_question_words_and_boxes()
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_rust = self.rust_tokenizer_class.from_pretrained(
tokenizer_rust = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
)
tokenizer_py = self.tokenizer_class.from_pretrained(
@ -206,7 +207,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
words[1] = tokenizer_r.mask_token
@ -536,8 +537,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id
@ -990,8 +991,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Input tokens id
words, boxes = self.get_words_and_boxes()
@ -1292,7 +1293,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
@ -1346,7 +1347,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
tokens_r = tokenizer_r.encode_plus(
words,
@ -1644,7 +1645,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id
@ -1743,7 +1744,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()

View File

@ -14,13 +14,14 @@
import json
import os
import unittest
from functools import lru_cache
from transformers import BatchEncoding, LEDTokenizer, LEDTokenizerFast
from transformers.models.led.tokenization_led import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, require_torch
from transformers.utils import cached_property
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -30,8 +31,10 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
rust_tokenizer_class = LEDTokenizerFast
test_rust_tokenizer = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = [
"l",
"o",
@ -56,22 +59,30 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
return "lower newer", "lower newer"
@ -161,8 +172,8 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)

View File

@ -60,13 +60,14 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True
from_pretrained_kwargs = {}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_tokenizers(self, **kwargs):
kwargs.update({"pad_token": "<PAD>"})
@ -149,8 +150,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list += (self.rust_tokenizer_class, "hf-internal-testing/llama-tokenizer", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()
@ -253,7 +254,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -263,7 +264,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
tokenizer_cr = self.get_rust_tokenizer(
pretrained_name,
additional_special_tokens=added_tokens,
**kwargs, # , from_slow=True <- unfortunately too slow to convert
@ -313,8 +314,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
EXPECTED_WITH_SPACE = [1, 18637, 920, 526, 366, 2599]
EXPECTED_WO_SPACE = [1, 29950, 1032, 920, 526, 366, 2599]
slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False)
fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False)
slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE)
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
self.assertEqual(slow_.tokenize(inputs), ["H", "ey", "▁how", "▁are", "▁you", "▁doing"])
@ -324,8 +325,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True),
)
slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False)
fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False)
slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE)
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"])

View File

@ -18,12 +18,13 @@ import itertools
import json
import os
import unittest
from functools import lru_cache
from transformers import AddedToken, LongformerTokenizer, LongformerTokenizerFast
from transformers.models.longformer.tokenization_longformer import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -36,8 +37,9 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
rust_tokenizer_class = LongformerTokenizerFast
test_rust_tokenizer = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@ -64,22 +66,30 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
@ -173,8 +183,8 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
@ -204,7 +214,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_change_add_prefix_space_and_trim_offsets_args(self):
for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
)
@ -224,7 +234,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name`
text = f"{text_of_1_token} {text_of_1_token}"
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -234,7 +244,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -244,7 +254,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -254,7 +264,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -276,7 +286,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
# )
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -286,7 +296,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -296,7 +306,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
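One detail worth noting about the Longformer calls above: lru_cache keys on the full argument tuple, so every distinct combination of pretrained_name and keyword arguments (each add_prefix_space / trim_offsets variant here) gets its own cached instance, which is presumably why the accessors use maxsize=64 rather than a single-slot cache. A tiny illustration of that keying behavior, independent of the test suite:

from functools import lru_cache


@lru_cache(maxsize=64)
def load(name, **kwargs):
    # Stand-in for an expensive from_pretrained call.
    return object()


assert load("x", trim_offsets=True) is load("x", trim_offsets=True)       # same key, cache hit
assert load("x", trim_offsets=True) is not load("x", trim_offsets=False)  # different key, new instance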

View File

@ -14,12 +14,13 @@
# limitations under the License.
import unittest
from functools import lru_cache
from typing import Tuple
from transformers import AddedToken, LukeTokenizer
from transformers.testing_utils import get_tests_dir, require_torch, slow
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json")
@ -33,13 +34,17 @@ class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False
from_pretrained_kwargs = {"cls_token": "<s>"}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}
cls.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}
def get_tokenizer(self, task=None, **kwargs):
kwargs.update(self.special_tokens_map)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, task=None, **kwargs):
kwargs.update(cls.special_tokens_map)
tokenizer = LukeTokenizer(
vocab_file=SAMPLE_VOCAB,
merges_file=SAMPLE_MERGE_FILE,
@ -137,8 +142,8 @@ class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)

View File

@ -32,8 +32,9 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
space_between_special_tokens = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"[UNK]",
@ -50,8 +51,8 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):

View File

@ -14,6 +14,7 @@
import tempfile
import unittest
from functools import lru_cache
from pathlib import Path
from shutil import copyfile
@ -32,7 +33,7 @@ from transformers.utils import is_sentencepiece_available
if is_sentencepiece_available():
from transformers.models.m2m_100.tokenization_m2m_100 import VOCAB_FILES_NAMES, save_json
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
if is_sentencepiece_available():
@ -54,21 +55,26 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_seq2seq = False
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
save_dir = Path(self.tmpdirname)
save_dir = Path(cls.tmpdirname)
save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"])
tokenizer = M2M100Tokenizer.from_pretrained(self.tmpdirname)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer = M2M100Tokenizer.from_pretrained(cls.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_tokenizer(self, **kwargs):
return M2M100Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return M2M100Tokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
return (

View File

@ -15,6 +15,7 @@
import tempfile
import unittest
from functools import lru_cache
from pathlib import Path
from shutil import copyfile
@ -26,7 +27,7 @@ from transformers.utils import is_sentencepiece_available, is_tf_available, is_t
if is_sentencepiece_available():
from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model")
@ -50,22 +51,28 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
save_dir = Path(self.tmpdirname)
save_dir = Path(cls.tmpdirname)
save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab"])
save_json(mock_tokenizer_config, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"])
if not (save_dir / VOCAB_FILES_NAMES["source_spm"]).exists():
copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["source_spm"])
copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["target_spm"])
tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer = MarianTokenizer.from_pretrained(cls.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_tokenizer(self, **kwargs) -> MarianTokenizer:
return MarianTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> MarianTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return MarianTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
return (

View File

@ -50,26 +50,27 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_kwargs = {"cls_token": "<s>"}
test_seq2seq = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", "\u0120hello", "\u0120world", "<unk>",] # fmt: skip
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
self.special_tokens_map = {"unk_token": "<unk>"}
cls.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
self.tokenizer_config_file = os.path.join(self.tmpdirname, "tokenizer_config.json")
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
cls.tokenizer_config_file = os.path.join(cls.tmpdirname, "tokenizer_config.json")
with open(self.vocab_file, "w", encoding="utf-8") as fp:
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
with open(self.tokenizer_config_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps({"tags_dict": self.tags_dict}))
with open(cls.tokenizer_config_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps({"tags_dict": cls.tags_dict}))
def get_nodes_and_xpaths(self):
nodes = ["hello", "world"]
@ -421,8 +422,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id
@ -828,8 +829,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Input tokens id
nodes, xpaths = self.get_nodes_and_xpaths()
@ -1010,7 +1011,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_mapping(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
text = ["a", "wonderful", "test"]
xpaths = ["html/body" for _ in range(len(text))]
@ -1125,7 +1126,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
nodes, xpaths = self.get_nodes_and_xpaths()
@ -1187,7 +1188,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
nodes, xpaths = self.get_nodes_and_xpaths()
tokens_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True)
tokens_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True)
@ -1490,7 +1491,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id

View File

@ -47,12 +47,13 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_full_tokenizer(self):
tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True)
@ -139,8 +140,8 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()

View File

@ -47,12 +47,13 @@ class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@ -117,8 +118,8 @@ class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()

View File

@ -17,12 +17,13 @@
import json
import os
import unittest
from functools import lru_cache
from transformers import MgpstrTokenizer
from transformers.models.mgp_str.tokenization_mgp_str import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -33,18 +34,23 @@ class MgpstrTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_kwargs = {}
test_seq2seq = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = ['[GO]', '[s]', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] # fmt: skip
vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
def get_tokenizer(self, **kwargs):
return MgpstrTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return MgpstrTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "tester"

View File

@ -15,12 +15,13 @@
import unittest
from functools import lru_cache
from typing import Tuple
from transformers.models.mluke.tokenization_mluke import MLukeTokenizer
from transformers.testing_utils import get_tests_dir, require_torch, slow
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@ -33,13 +34,17 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False
from_pretrained_kwargs = {"cls_token": "<s>"}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}
cls.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}
def get_tokenizer(self, task=None, **kwargs):
kwargs.update(self.special_tokens_map)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, task=None, **kwargs):
kwargs.update(cls.special_tokens_map)
kwargs.update({"task": task})
tokenizer = MLukeTokenizer(vocab_file=SAMPLE_VOCAB, entity_vocab_file=SAMPLE_ENTITY_VOCAB, **kwargs)
return tokenizer
@ -100,8 +105,8 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)

View File

@ -41,8 +41,9 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_filter = filter_non_english
pre_trained_model_path = "google/mobilebert-uncased"
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"[UNK]",
@ -61,13 +62,13 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
self.tokenizers_list = [
(tokenizer_def[0], self.pre_trained_model_path, tokenizer_def[2]) # else the 'google/' prefix is stripped
for tokenizer_def in self.tokenizers_list
cls.tokenizers_list = [
(tokenizer_def[0], cls.pre_trained_model_path, tokenizer_def[2]) # else the 'google/' prefix is stripped
for tokenizer_def in cls.tokenizers_list
]
# Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.get_input_output_texts
@ -275,7 +276,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus(
@ -331,8 +332,8 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
kwargs["tokenize_chinese_chars"] = True
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@ -345,8 +346,8 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)
kwargs["tokenize_chinese_chars"] = False
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)

View File

@ -51,8 +51,9 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
from_pretrained_kwargs = {}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = PreTrainedTokenizerFast(
@ -62,10 +63,11 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
eos_token="</s>",
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizerFast:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
@unittest.skip(reason="No slow tokenizer")
def test_added_tokens_serialization(self):

View File

@ -32,8 +32,9 @@ class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
space_between_special_tokens = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"[UNK]",
@ -52,8 +53,8 @@ class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
"low",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):

View File

@ -14,13 +14,14 @@
import json
import os
import unittest
from functools import lru_cache
from transformers import BatchEncoding, MvpTokenizer, MvpTokenizerFast
from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, require_torch
from transformers.utils import cached_property
from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors
from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors, use_cache_if_possible
@require_tokenizers
@ -32,8 +33,10 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_filter = filter_roberta_detectors
# from_pretrained_kwargs = {'add_prefix_space': True}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = [
"l",
"o",
@ -58,22 +61,30 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
return "lower newer", "lower newer"
@ -153,8 +164,8 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)

View File

@ -16,6 +16,7 @@ import binascii
import unittest
from transformers import MyT5Tokenizer
from transformers.testing_utils import slow
from transformers.utils import is_tf_available, is_torch_available
from ...test_tokenization_common import TokenizerTesterMixin
@ -86,15 +87,14 @@ class TestByteRewriter(unittest.TestCase):
self.assertEqual(decompose_rewriter.rewrite_bytes(in_hex), out_hex)
# This is way too slow, so let's not run it on CircleCI. When trying to use the cache, we get OOM and the workers crash.
@slow
class MyT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MyT5Tokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
def get_tokenizer(self, **kwargs) -> MyT5Tokenizer:
return self.tokenizer_class.from_pretrained("Tomlim/myt5-base", **kwargs)
def get_tokenizer(cls, **kwargs) -> MyT5Tokenizer:
return cls.tokenizer_class.from_pretrained("Tomlim/myt5-base", **kwargs)
@unittest.skip(reason="inputs cannot be pretokenized as ids depend on whole input string")
def test_pretokenized_inputs(self):

View File

@ -56,12 +56,13 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True
from_pretrained_kwargs = {}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_full_tokenizer(self):
tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True)
@ -143,8 +144,8 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-nllb", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()
@ -262,7 +263,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -272,7 +273,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
tokenizer_cr = self.get_rust_tokenizer(
pretrained_name,
additional_special_tokens=added_tokens,
**kwargs, # , from_slow=True <- unfortunately too slow to convert

View File

@ -13,13 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import unittest
from functools import lru_cache
from transformers import NougatTokenizerFast
from transformers.models.nougat.tokenization_nougat_fast import markdown_compatible, normalize_list_like_lines
from transformers.testing_utils import require_levenshtein, require_nltk, require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -33,19 +35,26 @@ class NougatTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_vocab_key = "tokenizer_file"
special_tokens_map = {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = NougatTokenizerFast.from_pretrained("facebook/nougat-base")
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return NougatTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
_kwargs = copy.deepcopy(cls.special_tokens_map)
_kwargs.update(kwargs)
kwargs = _kwargs
pretrained_name = pretrained_name or cls.tmpdirname
return NougatTokenizerFast.from_pretrained(pretrained_name, **kwargs)
def test_padding(self, max_length=6):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Simple input
sentence1 = "This is a simple input"
sentence2 = ["This is a simple input 1", "This is a simple input 2"]

View File

@ -35,8 +35,9 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_seq2seq = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@ -65,11 +66,11 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp:
with open(cls.merges_file, "w") as fp:
fp.write("\n".join(merges))
def get_input_output_texts(self, tokenizer):
@ -90,7 +91,7 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"

View File

@ -13,12 +13,13 @@
# limitations under the License.
import unittest
from functools import lru_cache
from transformers import PegasusTokenizer, PegasusTokenizerFast
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow
from transformers.utils import cached_property
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")
@ -33,19 +34,24 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = PegasusTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
@cached_property
def _large_tokenizer(self):
return PegasusTokenizer.from_pretrained("google/pegasus-large")
def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PegasusTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return PegasusTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
return ("This is a test", "This is a test")
@ -70,8 +76,8 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertEqual(self.get_tokenizer().vocab_size, 1_103)
def test_mask_tokens_rust_pegasus(self):
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
rust_tokenizer = self.get_rust_tokenizer(self.tmpdirname)
py_tokenizer = self.get_tokenizer(self.tmpdirname)
raw_input_str = (
"Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important"
" </s> <pad> <pad> <pad>"
@ -138,26 +144,31 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = PegasusTokenizer(SAMPLE_VOCAB, offset=0, mask_token_sent=None, mask_token="[MASK]")
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
@cached_property
def _large_tokenizer(self):
return PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PegasusTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return PegasusTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
return ("This is a test", "This is a test")
def test_mask_tokens_rust_pegasus(self):
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
rust_tokenizer = self.get_rust_tokenizer(self.tmpdirname)
py_tokenizer = self.get_tokenizer(self.tmpdirname)
raw_input_str = (
"Let's see which <unk> is the better <unk_token> one [MASK] It seems like this [MASK] was important </s>"
" <pad> <pad> <pad>"

View File

@ -19,12 +19,13 @@ import re
import shutil
import tempfile
import unittest
from functools import lru_cache
from typing import Tuple
from transformers import AddedToken, BatchEncoding, PerceiverTokenizer
from transformers.utils import cached_property, is_tf_available, is_torch_available
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
if is_torch_available():
@ -40,17 +41,22 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = PerceiverTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = PerceiverTokenizer()
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
@cached_property
def perceiver_tokenizer(self):
return PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
def get_tokenizer(self, **kwargs) -> PerceiverTokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PerceiverTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
# XXX The default common tokenizer tests assume that every ID is decodable on its own.

View File

@ -15,10 +15,11 @@
import os
import unittest
from functools import lru_cache
from transformers.models.phobert.tokenization_phobert import VOCAB_FILES_NAMES, PhobertTokenizer
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@ -26,27 +27,32 @@ class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = PhobertTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["T@@", "i", "I", "R@@", "r", "e@@"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "l à</w>"]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
for token in vocab_tokens:
fp.write(f"{token} {vocab_tokens[token]}\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return PhobertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return PhobertTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "Tôi là VinAI Research"

View File

@ -45,12 +45,13 @@ class PLBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
rust_tokenizer_class = None
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_full_base_tokenizer(self):
tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True)

View File

@ -36,8 +36,9 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = ProphetNetTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"[UNK]",
@ -56,8 +57,8 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):

View File

@ -14,15 +14,17 @@
# limitations under the License.
import copy
import json
import os
import unittest
from functools import lru_cache
from transformers import AddedToken, Qwen2Tokenizer, Qwen2TokenizerFast
from transformers.models.qwen2.tokenization_qwen2 import VOCAB_FILES_NAMES, bytes_to_unicode
from transformers.testing_utils import require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -36,8 +38,9 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
from_pretrained_kwargs = None
test_seq2seq = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# this makes sure the vocabulary is complete at the byte level.
vocab = list(bytes_to_unicode().values())
@ -81,22 +84,34 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"# #",
]
self.special_tokens_map = {"eos_token": "<|endoftext|>"}
cls.special_tokens_map = {"eos_token": "<|endoftext|>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return Qwen2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
_kwargs = copy.deepcopy(cls.special_tokens_map)
_kwargs.update(kwargs)
kwargs = _kwargs
pretrained_name = pretrained_name or cls.tmpdirname
return Qwen2Tokenizer.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return Qwen2TokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
_kwargs = copy.deepcopy(cls.special_tokens_map)
_kwargs.update(kwargs)
kwargs = _kwargs
pretrained_name = pretrained_name or cls.tmpdirname
return Qwen2TokenizerFast.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
# this case should cover
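Note that the Qwen2 helpers above (like the Nougat ones earlier in this diff) deep-copy cls.special_tokens_map and then apply the caller's kwargs on top, whereas tokenizers such as BART and RoBERTa use kwargs.update(cls.special_tokens_map). A tiny illustration of the resulting precedence, with made-up values:

import copy

special_tokens_map = {"eos_token": "<|endoftext|>"}

# kwargs.update(special_tokens_map): the class-level defaults win.
kwargs = {"eos_token": "<custom>"}
kwargs.update(special_tokens_map)
assert kwargs["eos_token"] == "<|endoftext|>"

# deepcopy + update(kwargs): the caller's kwargs win, and each (cached) call
# works on its own copy of the default values.
kwargs = {"eos_token": "<custom>"}
merged = copy.deepcopy(special_tokens_map)
merged.update(kwargs)
assert merged["eos_token"] == "<custom>"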

View File

@ -34,11 +34,12 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_seq2seq = False
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@ -84,7 +85,7 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=15):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Simple input
s = "This is a simple input"

View File

@ -39,11 +39,12 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece_ignore_case = True
pre_trained_model_path = "google/rembert"
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = RemBertTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
# Copied from ReformerTokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer):
@ -222,7 +223,7 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
if self.rust_tokenizer_class is not None:
tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos)
self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
# We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright

View File

@ -18,12 +18,13 @@ import itertools
import json
import os
import unittest
from functools import lru_cache
from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast
from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_tokenizers
@ -34,8 +35,9 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
from_pretrained_kwargs = {"cls_token": "<s>"}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@ -62,22 +64,30 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
self.special_tokens_map = {"unk_token": "<unk>"}
cls.special_tokens_map = {"unk_token": "<unk>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
with open(self.merges_file, "w", encoding="utf-8") as fp:
with open(cls.merges_file, "w", encoding="utf-8") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
@ -171,8 +181,8 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_embeded_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
@ -202,7 +212,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_change_add_prefix_space_and_trim_offsets_args(self):
for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
)
@ -222,7 +232,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
text_of_1_token = "hello" # `hello` is a token in the vocabulary of `pretrained_name`
text = f"{text_of_1_token} {text_of_1_token}"
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -232,7 +242,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -242,7 +252,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -252,7 +262,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -274,7 +284,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
# )
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -284,7 +294,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@ -294,7 +304,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
(1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
)
encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)

View File

@ -41,8 +41,9 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
space_between_special_tokens = True
from_pretrained_filter = filter_non_english
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "", "", "", "", "a", "b", "c", "d"]
word_shape = {}
@ -50,14 +51,14 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for i, value in enumerate(vocab_tokens):
word_shape[value] = i
word_pronunciation[value] = i
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.word_shape_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["word_shape_file"])
self.word_pronunciation_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["word_pronunciation_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.word_shape_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["word_shape_file"])
cls.word_pronunciation_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["word_pronunciation_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
with open(self.word_shape_file, "w", encoding="utf-8") as word_shape_writer:
with open(cls.word_shape_file, "w", encoding="utf-8") as word_shape_writer:
json.dump(word_shape, word_shape_writer, ensure_ascii=False)
with open(self.word_pronunciation_file, "w", encoding="utf-8") as word_pronunciation_writer:
with open(cls.word_pronunciation_file, "w", encoding="utf-8") as word_pronunciation_writer:
json.dump(word_pronunciation, word_pronunciation_writer, ensure_ascii=False)
def test_full_tokenizer(self):
@ -204,7 +205,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus(
@ -260,8 +261,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
kwargs["tokenize_chinese_chars"] = True
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@ -274,8 +275,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)
kwargs["tokenize_chinese_chars"] = False
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)

View File

@ -15,11 +15,12 @@
import tempfile
import unittest
from functools import lru_cache
from transformers import RoFormerTokenizer, RoFormerTokenizerFast
from transformers.testing_utils import require_rjieba, require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_rjieba
@ -31,14 +32,25 @@ class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
space_between_special_tokens = True
test_rust_tokenizer = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = cls.tokenizer_class.from_pretrained("junnyu/roformer_chinese_base")
tokenizer.save_pretrained(cls.tmpdirname)
def get_tokenizer(self, **kwargs):
return self.tokenizer_class.from_pretrained("junnyu/roformer_chinese_base", **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs):
return self.rust_tokenizer_class.from_pretrained("junnyu/roformer_chinese_base", **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_chinese_input_output_texts(self):
input_text = "永和服装饰品有限公司,今天天气非常好"

View File

@ -59,12 +59,13 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True
from_pretrained_kwargs = {}
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = SeamlessM4TTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_full_tokenizer(self):
tokenizer = SeamlessM4TTokenizer(SAMPLE_VOCAB, keep_accents=True)
@ -353,7 +354,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
r_output = tokenizer_r.encode("Hey this is a <special> token")
@ -363,7 +364,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
tokenizer_cr = self.get_rust_tokenizer(
pretrained_name,
additional_special_tokens=added_tokens,
**kwargs, # , from_slow=True <- unfortunately too slow to convert

View File

@ -17,12 +17,13 @@ import json
import os
import tempfile
import unittest
from functools import lru_cache
from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, SiglipTokenizer
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
from transformers.utils import cached_property, is_tf_available, is_torch_available
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@ -44,13 +45,13 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = True
test_sentencepiece_ignore_case = True
# Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.setUp with T5->Siglip
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = SiglipTokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
# Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_convert_token_and_id with T5->Siglip
def test_convert_token_and_id(self):
@ -135,9 +136,12 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def siglip_tokenizer(self):
return SiglipTokenizer.from_pretrained("google/siglip-base-patch16-224")
# Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.get_tokenizer with T5->Siglip
def get_tokenizer(self, **kwargs) -> SiglipTokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> SiglipTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_rust_and_python_full_tokenizers with T5->Siglip
def test_rust_and_python_full_tokenizers(self):
@ -227,10 +231,10 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
)
tokenizer_p = self.tokenizer_class.from_pretrained(

View File

@ -42,8 +42,9 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
spm_model = sp.SentencePieceProcessor()
spm_model.Load(SAMPLE_VOCAB)
@ -52,13 +53,13 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
save_dir = Path(self.tmpdirname)
save_dir = Path(cls.tmpdirname)
save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
copyfile(SAMPLE_VOCAB, save_dir / VOCAB_FILES_NAMES["spm_file"])
tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer = Speech2TextTokenizer.from_pretrained(cls.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""

View File

@ -35,8 +35,9 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = False
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = SpeechT5Tokenizer(SAMPLE_VOCAB)
@ -46,7 +47,7 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer.add_special_tokens({"mask_token": mask_token})
tokenizer.add_tokens(["<ctc_blank>"])
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "this is a test"

View File

@ -13,8 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from functools import lru_cache
from tests.test_tokenization_common import TokenizerTesterMixin
from tests.test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
from transformers import SplinterTokenizerFast, is_tf_available, is_torch_available
from transformers.models.splinter import SplinterTokenizer
from transformers.testing_utils import get_tests_dir, slow
@ -40,20 +41,29 @@ class SplinterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
pre_trained_model_path = "tau/splinter-base"
# Copied from transformers.models.siglip.SiglipTokenizationTest.setUp
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = SplinterTokenizer(SAMPLE_VOCAB)
tokenizer.vocab["[UNK]"] = len(tokenizer.vocab)
tokenizer.vocab["[QUESTION]"] = len(tokenizer.vocab)
tokenizer.vocab["."] = len(tokenizer.vocab)
tokenizer.add_tokens("this is a test thou shall not determine rigor truly".split())
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_tokenizer(self, **kwargs) -> SplinterTokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> SplinterTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> SplinterTokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> SplinterTokenizerFast:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
# Copied from transformers.models.siglip.SiglipTokenizationTest.test_get_vocab
def test_get_vocab(self):

View File

@ -13,22 +13,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import lru_cache
from transformers import SqueezeBertTokenizer, SqueezeBertTokenizerFast
from transformers.testing_utils import require_tokenizers, slow
from ..bert.test_tokenization_bert import BertTokenizationTest
from ...test_tokenization_common import use_cache_if_possible
# Avoid import `BertTokenizationTest` directly as it will run as `test_tokenization_squeezebert.py::BertTokenizationTest`
# together with `test_tokenization_bert.py::BertTokenizationTest`.
from ..bert import test_tokenization_bert
@require_tokenizers
class SqueezeBertTokenizationTest(BertTokenizationTest):
class SqueezeBertTokenizationTest(test_tokenization_bert.BertTokenizationTest):
tokenizer_class = SqueezeBertTokenizer
rust_tokenizer_class = SqueezeBertTokenizerFast
test_rust_tokenizer = True
from_pretrained_id = "squeezebert/squeezebert-uncased"
def get_rust_tokenizer(self, **kwargs):
return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
pretrained_name = pretrained_name or cls.tmpdirname
return SqueezeBertTokenizerFast.from_pretrained(pretrained_name, **kwargs)
@slow
def test_sequence_builders(self):
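The module-level import above exists because unittest (and pytest) collect every TestCase subclass bound to a module-level name, so importing BertTokenizationTest directly would run the Bert tests a second time from this file, as the comment notes. A small, self-contained demonstration of that collection behaviour (all class and module names below are made up):

import types
import unittest


class BaseTokenizationTest(unittest.TestCase):
    def test_something(self):
        self.assertTrue(True)


class ChildTokenizationTest(BaseTokenizationTest):
    pass


# A module that imports the base class directly: its tests are collected here too.
direct_import = types.ModuleType("fake_module_direct_import")
direct_import.BaseTokenizationTest = BaseTokenizationTest
direct_import.ChildTokenizationTest = ChildTokenizationTest
print(unittest.defaultTestLoader.loadTestsFromModule(direct_import).countTestCases())  # 2

# A module that only defines the subclass (the base is reached via a module import).
module_import = types.ModuleType("fake_module_module_import")
module_import.ChildTokenizationTest = ChildTokenizationTest
print(unittest.defaultTestLoader.loadTestsFromModule(module_import).countTestCases())  # 1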

View File

@ -17,12 +17,13 @@ import os
import re
import tempfile
import unittest
from functools import lru_cache
from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_seqio, require_tokenizers, slow
from transformers.utils import cached_property, is_tf_available, is_torch_available
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@ -44,12 +45,13 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = T5Tokenizer(SAMPLE_VOCAB)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@ -145,11 +147,19 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def t5_base_tokenizer_fast(self):
return T5TokenizerFast.from_pretrained("google-t5/t5-base")
def get_tokenizer(self, **kwargs) -> T5Tokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> T5Tokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> T5TokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> T5TokenizerFast:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
@ -275,10 +285,10 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
)
tokenizer_p = self.tokenizer_class.from_pretrained(
@@ -460,10 +470,8 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
EXPECTED_WITH_SPACE = [9459, 149, 33, 25, 692, 1]
EXPECTED_WO_SPACE = [3845, 63, 149, 33, 25, 692, 1]
slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False)
fast_ = self.rust_tokenizer_class.from_pretrained(
pretrained_name, add_prefix_space=False, legacy=False, from_slow=True
)
slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=False, legacy=False, from_slow=True)
self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE)
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
self.assertEqual(slow_.tokenize(inputs), ["He", "y", "▁how", "▁are", "▁you", "▁doing"])
@@ -473,8 +481,8 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True),
)
slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False)
fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False)
slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE)
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"])


@@ -112,8 +112,9 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return output_txt, output_ids
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab_tokens = [
"[UNK]",
@@ -132,8 +133,8 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
"low",
"lowest",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_input_output_texts(self, tokenizer):
@@ -352,7 +353,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_with_special_characters(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
tokens = tokenizer_r.encode_plus(


@@ -93,12 +93,13 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
return questions, words, boxes
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = UdopTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def get_input_output_texts(self, tokenizer):
input_text = "UNwant\u00e9d,running"
@@ -456,8 +457,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_padding(self, max_length=50):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id
@@ -922,8 +923,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Input tokens id
words, boxes = self.get_words_and_boxes()
@@ -1109,7 +1110,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_offsets_mapping(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
text = ["a", "wonderful", "test"]
boxes = [[1, 8, 12, 20] for _ in range(len(text))]
@@ -1239,8 +1240,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
@@ -1293,8 +1294,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
words, boxes = self.get_words_and_boxes()
tokens_r = tokenizer_r.encode_plus_boxes(
words,
@@ -1320,7 +1321,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_compare_add_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
@@ -1402,7 +1403,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
words = "Hey this is a <special> token".split()
@@ -1416,7 +1417,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertTrue(special_token_id in r_output)
if self.test_slow_tokenizer:
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
)
tokenizer_p = self.tokenizer_class.from_pretrained(
@@ -1591,8 +1592,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id


@@ -19,12 +19,13 @@ import os
import shutil
import tempfile
import unittest
from functools import lru_cache
from transformers import VitsTokenizer
from transformers.models.vits.tokenization_vits import VOCAB_FILES_NAMES
from transformers.testing_utils import slow
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
@@ -32,8 +33,9 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = VitsTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = (
"k ' z y u d h e s w 3 c p - 1 j m i X f l o 0 b r a 4 2 n _ x v t q 5 6 g ț ţ < > | <pad> <unk>".split(
@@ -44,18 +46,22 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens[" "] = vocab_tokens["X"]
del vocab_tokens["X"]
self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>"}
cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>"}
self.tmpdirname = tempfile.mkdtemp()
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.tmpdirname = tempfile.mkdtemp()
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
kwargs["phonemize"] = False
kwargs["normalize"] = False
return VitsTokenizer.from_pretrained(self.tmpdirname, **kwargs)
pretrained_name = pretrained_name or cls.tmpdirname
return VitsTokenizer.from_pretrained(pretrained_name, **kwargs)
def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5):
txt = "beyonce lives in los angeles"


@@ -21,6 +21,7 @@ import random
import shutil
import tempfile
import unittest
from functools import lru_cache
import numpy as np
@@ -33,7 +34,7 @@ from transformers import (
from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizerOutput
from transformers.testing_utils import require_torch, slow
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
global_rng = random.Random()
@@ -57,22 +58,27 @@ def floats_list(shape, scale=1.0, rng=None, name=None):
class Wav2Vec2TokenizerTest(unittest.TestCase):
tokenizer_class = Wav2Vec2Tokenizer
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
self.tmpdirname = tempfile.mkdtemp()
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.tmpdirname = tempfile.mkdtemp()
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return Wav2Vec2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return Wav2Vec2Tokenizer.from_pretrained(pretrained_name, **kwargs)
def test_tokenizer_decode(self):
# TODO(PVP) - change to facebook
@@ -237,7 +243,7 @@ class Wav2Vec2TokenizerTest(unittest.TestCase):
def test_save_pretrained(self):
pretrained_name = list(self.tokenizer_class.pretrained_vocab_files_map["vocab_file"].keys())[0]
tokenizer = self.tokenizer_class.from_pretrained(pretrained_name)
tokenizer = self.get_tokenizer(pretrained_name)
tmpdirname2 = tempfile.mkdtemp()
tokenizer_files = tokenizer.save_pretrained(tmpdirname2)
@@ -373,22 +379,27 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = Wav2Vec2CTCTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
self.tmpdirname = tempfile.mkdtemp()
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.tmpdirname = tempfile.mkdtemp()
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return Wav2Vec2CTCTokenizer.from_pretrained(pretrained_name, **kwargs)
def test_tokenizer_add_token_chars(self):
tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h")


@@ -17,6 +17,7 @@
import json
import os
import unittest
from functools import lru_cache
from typing import Tuple
from transformers import Wav2Vec2PhonemeCTCTokenizer
@@ -24,7 +25,7 @@ from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
from transformers.models.wav2vec2_phoneme.tokenization_wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizerOutput
from transformers.testing_utils import require_phonemizer
from ...test_tokenization_common import TokenizerTesterMixin
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
@require_phonemizer
@@ -33,8 +34,9 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = Wav2Vec2PhonemeCTCTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
vocab = (
"<s> <pad> </s> <unk> n s t ə l a i k d m ɛ ɾ e ɪ p o ɐ z ð f j v b ɹ ʁ ʊ iː r w ʌ u ɡ æ aɪ ʃ h ɔ ɑː "
@@ -53,10 +55,10 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
).split(" ")
vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
# overwrite since phonemes require specific creation
@@ -84,9 +86,13 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
output_ids = tokenizer.encode(output_txt, add_special_tokens=False)
return output_txt, output_ids
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return Wav2Vec2PhonemeCTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs):
kwargs.update(cls.special_tokens_map)
pretrained_name = pretrained_name or cls.tmpdirname
return Wav2Vec2PhonemeCTCTokenizer.from_pretrained(pretrained_name, **kwargs)
def test_tokenizer_add_new_tokens(self):
tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")


@@ -40,12 +40,13 @@ class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
test_sentencepiece = False
test_seq2seq = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
tokenizer.pad_token_id = 50256
tokenizer.pad_token = "<|endoftext|>"
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""


@@ -37,12 +37,13 @@ class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = XGLMTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""


@@ -29,8 +29,9 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMTokenizer
test_rust_tokenizer = False
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = [
@@ -59,11 +60,11 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w") as fp:
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(cls.vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp:
with open(cls.merges_file, "w") as fp:
fp.write("\n".join(merges))
def get_input_output_texts(self, tokenizer):


@@ -37,12 +37,13 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -148,8 +149,8 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-xlm-roberta", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()


@@ -33,12 +33,13 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()
@classmethod
def setUpClass(cls):
super().setUpClass()
# We have a SentencePiece fixture for testing
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
def test_convert_token_and_id(self):
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""


@@ -13,6 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import functools
import inspect
import itertools
import json
@@ -24,6 +26,7 @@ import tempfile
import traceback
import unittest
from collections import OrderedDict
from functools import lru_cache
from itertools import takewhile
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
@@ -69,6 +72,38 @@ if TYPE_CHECKING:
from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel
def use_cache_if_possible(func):
@functools.wraps(func)
def wrapper(*args, **kwargs):
use_cache = kwargs.pop("use_cache", True)
underline_func = func
if "functools" in str(func):
underline_func = func.__wrapped__
if not use_cache:
return underline_func(*args, **kwargs)
if any(not arg.__hash__ for arg in args):
return underline_func(*args, **kwargs)
elif any(not kwarg.__hash__ for kwarg in kwargs.values()):
return underline_func(*args, **kwargs)
cached = func(*args, **kwargs)
copied = copy.deepcopy(cached)
if hasattr(copied, "_tokenizer") and "tests.models.clip.test_tokenization_clip.CLIPTokenizationTest" in str(
args[0]
):
copied._tokenizer = cached._tokenizer
if hasattr(copied, "sp_model"):
copied.sp_model = cached.sp_model
return copied
return wrapper
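A minimal, self-contained sketch of how `use_cache_if_possible` is meant to be stacked with `@classmethod` and `@lru_cache` (the `_CachedFactoryExample` class and its `build` method are hypothetical; the real helpers return tokenizers from `from_pretrained`). The wrapper bypasses the cache when `use_cache=False` is passed or when any argument is unhashable, and otherwise returns a deep copy of the cached result so one test cannot mutate the object seen by another:

class _CachedFactoryExample:
    @classmethod
    @use_cache_if_possible
    @lru_cache(maxsize=64)
    def build(cls, name="default", **kwargs):
        # Stand-in for `tokenizer_class.from_pretrained(...)` in the real helpers.
        return {"name": name, **kwargs}


first = _CachedFactoryExample.build("x")                   # computed once, then cached
second = _CachedFactoryExample.build("x")                  # served from the cache as a deep copy
fresh = _CachedFactoryExample.build("x", use_cache=False)  # bypasses lru_cache entirely
assert first == second and first is not second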
logger = logging.get_logger(__name__)
NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
@@ -198,32 +233,34 @@ class TokenizerTesterMixin:
# test_sentencepiece must also be set to True
test_sentencepiece_ignore_case = False
def setUp(self) -> None:
@classmethod
def setUpClass(cls) -> None:
# Tokenizer.filter makes it possible to filter which Tokenizer to test based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name)
self.from_pretrained_id = (
[self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id
cls.from_pretrained_id = (
[cls.from_pretrained_id] if isinstance(cls.from_pretrained_id, str) else cls.from_pretrained_id
)
self.tokenizers_list = []
if self.test_rust_tokenizer:
self.tokenizers_list = [
cls.tokenizers_list = []
if cls.test_rust_tokenizer:
cls.tokenizers_list = [
(
self.rust_tokenizer_class,
cls.rust_tokenizer_class,
pretrained_id,
self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
cls.from_pretrained_kwargs if cls.from_pretrained_kwargs is not None else {},
)
for pretrained_id in self.from_pretrained_id
for pretrained_id in cls.from_pretrained_id
]
else:
self.tokenizers_list = []
cls.tokenizers_list = []
with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip()
cls._data = f_data.read().replace("\n\n", "\n").strip()
self.tmpdirname = tempfile.mkdtemp()
cls.tmpdirname = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.tmpdirname)
@classmethod
def tearDownClass(cls):
shutil.rmtree(cls.tmpdirname)
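A small, self-contained sketch of the fixture lifecycle this switch implies (the `ExampleClassLevelFixtureTest` class is illustrative): the temporary directory is now created once per test class in `setUpClass` and removed once in `tearDownClass`, instead of being recreated and deleted around every individual test method:

import os
import shutil
import tempfile
import unittest


class ExampleClassLevelFixtureTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.tmpdirname = tempfile.mkdtemp()  # shared by every test in this class

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname)  # removed once, after the last test has run
        super().tearDownClass()

    def test_shared_dir_exists(self):
        self.assertTrue(os.path.isdir(self.tmpdirname))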
def get_input_output_texts(self, tokenizer):
input_txt = self.get_clean_sequence(tokenizer)[0]
@@ -267,11 +304,19 @@ class TokenizerTesterMixin:
else:
raise ValueError("This tokenizer class has no tokenizer to be tested.")
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizer:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
@classmethod
@use_cache_if_possible
@lru_cache(maxsize=64)
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizerFast:
pretrained_name = pretrained_name or cls.tmpdirname
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
def tokenizer_integration_test_util(
self,
@@ -1263,7 +1308,7 @@ class TokenizerTesterMixin:
if not self.test_rust_tokenizer:
self.skipTest(reason="No fast tokenizer defined")
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name)
tokenizer_r = self.get_rust_tokenizer(pretrained_name)
self._check_no_pad_token_padding(tokenizer_r, conversations)
tokenizer_r.padding_side = "right"
@@ -1446,7 +1491,7 @@ class TokenizerTesterMixin:
if not self.test_rust_tokenizer:
self.skipTest(reason="No fast tokenizer defined")
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name)
tokenizer_r = self.get_rust_tokenizer(pretrained_name)
# Find where to truncate, as the amount of tokens is different for different tokenizers and I want the
# truncation to happen in the middle of the assistant content.
@@ -2050,11 +2095,9 @@ class TokenizerTesterMixin:
if self.rust_tokenizer_class is not None:
pretrained_name = self.from_pretrained_id
slow_tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, legacy=False)
slow_tokenizer = self.get_tokenizer(pretrained_name, legacy=False)
with self.subTest(f"{pretrained_name}"):
rust_tokenizer = self.rust_tokenizer_class.from_pretrained(
pretrained_name, from_slow=True, legacy=False
)
rust_tokenizer = self.get_rust_tokenizer(pretrained_name, from_slow=True, legacy=False)
input_full_vocab_ids = list(
range(len(slow_tokenizer))
)  # TODO let's maybe shuffle this! And run it 4 times. This way we cover more combinations
@@ -2200,14 +2243,10 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
if self.test_rust_tokenizer:
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, padding_side="left", **kwargs
)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, padding_side="left", **kwargs)
self.assertEqual(tokenizer_r.padding_side, "left")
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, padding_side="right", **kwargs
)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, padding_side="right", **kwargs)
self.assertEqual(tokenizer_r.padding_side, "right")
self.assertRaises(
@@ -2219,10 +2258,10 @@ class TokenizerTesterMixin:
)
if self.test_slow_tokenizer:
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="left", **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, padding_side="left", **kwargs)
self.assertEqual(tokenizer_p.padding_side, "left")
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="right", **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, padding_side="right", **kwargs)
self.assertEqual(tokenizer_p.padding_side, "right")
self.assertRaises(
@@ -2237,14 +2276,10 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
if self.test_rust_tokenizer:
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, truncation_side="left", **kwargs
)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, truncation_side="left", **kwargs)
self.assertEqual(tokenizer_r.truncation_side, "left")
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
pretrained_name, truncation_side="right", **kwargs
)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, truncation_side="right", **kwargs)
self.assertEqual(tokenizer_r.truncation_side, "right")
self.assertRaises(
@@ -2256,14 +2291,10 @@ class TokenizerTesterMixin:
)
if self.test_slow_tokenizer:
tokenizer_p = self.tokenizer_class.from_pretrained(
pretrained_name, truncation_side="left", **kwargs
)
tokenizer_p = self.get_tokenizer(pretrained_name, truncation_side="left", **kwargs)
self.assertEqual(tokenizer_p.truncation_side, "left")
tokenizer_p = self.tokenizer_class.from_pretrained(
pretrained_name, truncation_side="right", **kwargs
)
tokenizer_p = self.get_tokenizer(pretrained_name, truncation_side="right", **kwargs)
self.assertEqual(tokenizer_p.truncation_side, "right")
self.assertRaises(
@@ -3194,18 +3225,18 @@ class TokenizerTesterMixin:
def test_is_fast(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Check is_fast is set correctly
self.assertTrue(tokenizer_r.is_fast)
if self.test_slow_tokenizer:
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertFalse(tokenizer_p.is_fast)
def test_fast_only_inputs(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
# Ensure None raise an error
self.assertRaises(TypeError, tokenizer_r.tokenize, None)
@@ -3216,7 +3247,7 @@ class TokenizerTesterMixin:
def test_alignement_methods(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
text = " ".join(words)
@@ -3446,8 +3477,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Ensure basic input match
input_p = tokenizer_p.encode_plus(self._data)
@@ -3487,8 +3518,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Check we have the same number of added_tokens for both pair and non-pair inputs.
self.assertEqual(
@@ -3505,8 +3536,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Check we have the correct max_length for both pair and non-pair inputs.
self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
@@ -3520,8 +3551,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
# sometimes the tokenizer saved online is not the same
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# Assert the set of special tokens match.
self.assertSequenceEqual(
@@ -3532,7 +3563,7 @@ class TokenizerTesterMixin:
def test_add_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
vocab_size = len(tokenizer_r)
self.assertEqual(tokenizer_r.add_tokens(""), 0)
@@ -3558,7 +3589,7 @@ class TokenizerTesterMixin:
def test_offsets_mapping(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
text = "Wonderful no inspiration example with subtoken"
pair = "Along with an awesome pair"
@@ -3601,7 +3632,7 @@ class TokenizerTesterMixin:
This needs to be padded so that it can be represented as a tensor
"""
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer = self.get_rust_tokenizer(pretrained_name, **kwargs)
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"):
if is_torch_available():
@@ -3663,8 +3694,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
if hasattr(tokenizer_p, "add_prefix_space") and not tokenizer_p.add_prefix_space:
continue # Too hard to test for now
@@ -3745,8 +3776,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
input_simple = [1, 2, 3]
input_pair = [1, 2, 3]
@@ -3767,8 +3798,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
# # Input string
# input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False)
# input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False)
@@ -3812,8 +3843,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id
@@ -4038,8 +4069,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
pad_token_id = tokenizer_p.pad_token_id
@@ -4076,8 +4107,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()
@@ -4151,8 +4182,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(
sentence,
@@ -4176,7 +4207,7 @@ class TokenizerTesterMixin:
def test_compare_add_special_tokens(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
# pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True)
@@ -4219,8 +4250,8 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
string_sequence = "Asserting that both tokenizers are equal"
python_output = tokenizer_p.prepare_for_model(
tokenizer_p.encode(string_sequence, add_special_tokens=False)
@@ -4235,7 +4266,7 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
added_tokens = [AddedToken("<special>", lstrip=True)]
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
tokenizer_r = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
r_output = tokenizer_r.encode("Hey this is a <special> token")
@@ -4246,12 +4277,10 @@ class TokenizerTesterMixin:
if self.test_slow_tokenizer:
# in rust fast, you lose the information of the AddedToken when initializing with `additional_special_tokens`
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
tokenizer_cr = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
)
tokenizer_p = self.tokenizer_class.from_pretrained(
pretrained_name, additional_special_tokens=added_tokens, **kwargs
)
tokenizer_p = self.get_tokenizer(pretrained_name, additional_special_tokens=added_tokens, **kwargs)
p_output = tokenizer_p.encode("Hey this is a <special> token")
@@ -4498,7 +4527,7 @@ class TokenizerTesterMixin:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
with tempfile.TemporaryDirectory() as tmp_dir:
# Save the fast tokenizer files in a temporary directory
tokenizer_old = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs, use_fast=True)
tokenizer_old = self.get_rust_tokenizer(pretrained_name, **kwargs, use_fast=True)
tokenizer_old.save_pretrained(tmp_dir, legacy_format=False) # save only fast version
# Initialize toy model for the trainer
@@ -4532,13 +4561,11 @@ class TokenizerTesterMixin:
with tempfile.TemporaryDirectory() as tmp_dir_1:
# Here we check that even if we have initialized a fast tokenizer with a tokenizer_file we can
# still save only the slow version and use these saved files to rebuild a tokenizer
tokenizer_fast_old_1 = self.rust_tokenizer_class.from_pretrained(
pretrained_name, **kwargs, use_fast=True
)
tokenizer_fast_old_1 = self.get_rust_tokenizer(pretrained_name, **kwargs, use_fast=True)
tokenizer_file = os.path.join(tmp_dir_1, "tokenizer.json")
tokenizer_fast_old_1.backend_tokenizer.save(tokenizer_file)
tokenizer_fast_old_2 = self.rust_tokenizer_class.from_pretrained(
tokenizer_fast_old_2 = self.get_rust_tokenizer(
pretrained_name, **kwargs, use_fast=True, tokenizer_file=tokenizer_file
)
@@ -4560,10 +4587,10 @@ class TokenizerTesterMixin:
special_token = "<my_new_token>"
special_sentence = f"Hey this is a {special_token} token"
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_rust = self.rust_tokenizer_class.from_pretrained(
tokenizer_rust = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
)
tokenizer_py = self.tokenizer_class.from_pretrained(
tokenizer_py = self.get_tokenizer(
pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
)
@@ -4622,7 +4649,7 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
# Load a slow tokenizer from the hub, init with the new token for fast to also include it
tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos)
EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
@@ -4662,7 +4689,7 @@ class TokenizerTesterMixin:
with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
if self.rust_tokenizer_class is not None:
tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos)
self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
# We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright


@@ -33,19 +33,20 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
from_pretrained_vocab_key = "tokenizer_file"
def setUp(self):
self.test_rust_tokenizer = False # because we don't have pretrained_vocab_files_map
super().setUp()
self.test_rust_tokenizer = True
@classmethod
def setUpClass(cls):
cls.test_rust_tokenizer = False # because we don't have pretrained_vocab_files_map
super().setUpClass()
cls.test_rust_tokenizer = True
model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]
self.bytelevel_bpe_model_name = "SaulLu/dummy-tokenizer-bytelevel-bpe"
cls.bytelevel_bpe_model_name = "SaulLu/dummy-tokenizer-bytelevel-bpe"
# Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]
cls.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
tokenizer.save_pretrained(self.tmpdirname)
tokenizer.save_pretrained(cls.tmpdirname)
@unittest.skip(
"We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any model"