Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-16 11:08:23 +06:00)

Commit 1fcaad6df9 (parent 3af425d4c6)

Use lru_cache for tokenization tests (#36818)

* fix
* fix
* fix
* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
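The whole diff applies one refactor to the tokenization test suites: per-instance setUp fixtures become class-level setUpClass fixtures, and the get_tokenizer / get_rust_tokenizer helpers become classmethods memoized with functools.lru_cache, guarded by the use_cache_if_possible helper imported from test_tokenization_common, so a tokenizer built from the same pretrained_name and kwargs is only loaded from disk once per test class. The sketch below is a minimal, self-contained illustration of that decorator stack; the use_cache_if_possible stand-in and the ToyTokenizerTest class are hypothetical simplifications, not the actual helper from tests/test_tokenization_common.py.

import functools
from functools import lru_cache


def use_cache_if_possible(func):
    # Simplified stand-in: skip the lru_cache layer whenever an argument is
    # unhashable (e.g. a list of AddedToken), since lru_cache would raise.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            hash(args + tuple(sorted(kwargs.items())))
        except TypeError:
            return func.__wrapped__(*args, **kwargs)  # call the undecorated function
        return func(*args, **kwargs)

    return wrapper


class ToyTokenizerTest:
    tmpdirname = "/tmp/toy-tokenizer"  # hypothetical fixture directory

    @classmethod
    @use_cache_if_possible
    @lru_cache(maxsize=64)
    def get_tokenizer(cls, pretrained_name=None, **kwargs):
        pretrained_name = pretrained_name or cls.tmpdirname
        print(f"loading tokenizer from {pretrained_name}")  # executed once per distinct call
        return object()  # stands in for SomeTokenizer.from_pretrained(pretrained_name, **kwargs)


first = ToyTokenizerTest.get_tokenizer()
second = ToyTokenizerTest.get_tokenizer()
assert first is second  # the second call is served from the cache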
@@ -34,12 +34,13 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_sentencepiece = True
     test_sentencepiece_ignore_case = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def get_input_output_texts(self, tokenizer):
         input_text = "this is a test"
@@ -14,13 +14,14 @@
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers import BartTokenizer, BartTokenizerFast, BatchEncoding
 from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_tokenizers, require_torch
 from transformers.utils import cached_property

-from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors
+from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors, use_cache_if_possible


 @require_tokenizers
@@ -32,8 +33,10 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_filter = filter_roberta_detectors
     # from_pretrained_kwargs = {'add_prefix_space': True}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+
         vocab = [
             "l",
             "o",
@@ -58,22 +61,30 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         return "lower newer", "lower newer"
@@ -154,8 +165,8 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 sentence = "A, <mask> AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                 tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
@@ -31,13 +31,14 @@ class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         tokenizer = BarthezTokenizerFast.from_pretrained("moussaKam/mbarthez")
-        tokenizer.save_pretrained(self.tmpdirname)
-        tokenizer.save_pretrained(self.tmpdirname, legacy_format=False)
-        self.tokenizer = tokenizer
+        tokenizer.save_pretrained(cls.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname, legacy_format=False)
+        cls.tokenizer = tokenizer

     def test_convert_token_and_id(self):
         """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -15,11 +15,12 @@

 import os
 import unittest
+from functools import lru_cache

 from transformers.models.bartpho.tokenization_bartpho import VOCAB_FILES_NAMES, BartphoTokenizer
 from transformers.testing_utils import get_tests_dir

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model")
@@ -31,24 +32,29 @@ class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = ["▁This", "▁is", "▁a", "▁t", "est"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.monolingual_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["monolingual_vocab_file"])
-        with open(self.monolingual_vocab_file, "w", encoding="utf-8") as fp:
+        cls.monolingual_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["monolingual_vocab_file"])
+        with open(cls.monolingual_vocab_file, "w", encoding="utf-8") as fp:
             for token in vocab_tokens:
                 fp.write(f"{token} {vocab_tokens[token]}\n")

-        tokenizer = BartphoTokenizer(SAMPLE_VOCAB, self.monolingual_vocab_file, **self.special_tokens_map)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer = BartphoTokenizer(SAMPLE_VOCAB, cls.monolingual_vocab_file, **cls.special_tokens_map)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return BartphoTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return BartphoTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "This is a là test"
@@ -41,8 +41,9 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     space_between_special_tokens = True
     from_pretrained_filter = filter_non_english

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = [
             "[UNK]",
@@ -61,8 +62,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "low",
             "lowest",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

     def get_input_output_texts(self, tokenizer):
@@ -257,7 +258,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_offsets_with_special_characters(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
                 tokens = tokenizer_r.encode_plus(
@@ -312,8 +313,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 kwargs["tokenize_chinese_chars"] = True
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
                 ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@@ -326,8 +327,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)

                 kwargs["tokenize_chinese_chars"] = False
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
                 ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
@@ -34,11 +34,12 @@ class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         tokenizer = BertGenerationTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_convert_token_and_id(self):
         """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -17,6 +17,7 @@
 import os
 import pickle
 import unittest
+from functools import lru_cache

 from transformers import AutoTokenizer
 from transformers.models.bert.tokenization_bert import BertTokenizer
@@ -31,7 +32,7 @@ from transformers.models.bert_japanese.tokenization_bert_japanese import (
 )
 from transformers.testing_utils import custom_tokenizers, require_jumanpp, require_sudachi_projection

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @custom_tokenizers
@@ -41,8 +42,9 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     space_between_special_tokens = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = [
             "[UNK]",
@@ -72,8 +74,8 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "です",
         ]

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

     def get_input_output_texts(self, tokenizer):
@@ -408,17 +410,21 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC
     tokenizer_class = BertJapaneseTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"]

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

-    def get_tokenizer(self, **kwargs):
-        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, subword_tokenizer_type="character", **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        return BertJapaneseTokenizer.from_pretrained(cls.tmpdirname, subword_tokenizer_type="character", **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "こんにちは、世界。 \nこんばんは、世界。"
@@ -15,10 +15,11 @@

 import os
 import unittest
+from functools import lru_cache

 from transformers.models.bertweet.tokenization_bertweet import VOCAB_FILES_NAMES, BertweetTokenizer

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@@ -26,26 +27,31 @@ class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BertweetTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["I", "m", "V@@", "R@@", "r", "e@@"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "a m</w>"]
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             for token in vocab_tokens:
                 fp.write(f"{token} {vocab_tokens[token]}\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return BertweetTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return BertweetTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "I am VinAI Research"
@@ -36,11 +36,12 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

-        tokenizer = self.tokenizer_class(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer = cls.tokenizer_class(SAMPLE_VOCAB, keep_accents=True)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_convert_token_and_id(self):
         """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -30,8 +30,9 @@ class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BioGptTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = [
@@ -60,11 +61,11 @@ class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w") as fp:
             fp.write(json.dumps(vocab_tokens))
-        with open(self.merges_file, "w") as fp:
+        with open(cls.merges_file, "w") as fp:
             fp.write("\n".join(merges))

     def get_input_output_texts(self, tokenizer):
@@ -18,13 +18,14 @@
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers.models.blenderbot_small.tokenization_blenderbot_small import (
     VOCAB_FILES_NAMES,
     BlenderbotSmallTokenizer,
 )

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
@@ -32,25 +33,30 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BlenderbotSmallTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))

         merges = ["#version: 0.2", "a p", "t e</w>", "ap t</w>", "a d", "ad apt</w>", "a c", "ac t</w>", ""]
-        self.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"}
+        cls.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return BlenderbotSmallTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return BlenderbotSmallTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "adapt act apte"
@@ -13,14 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import copy
 import unittest
+from functools import lru_cache

 from datasets import load_dataset

 from transformers import BloomTokenizerFast
 from transformers.testing_utils import require_jinja, require_tokenizers

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -34,14 +36,21 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_vocab_key = "tokenizer_file"
     special_tokens_map = {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
         tokenizer = BloomTokenizerFast.from_pretrained("bigscience/tokenizer")
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return BloomTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        _kwargs = copy.deepcopy(cls.special_tokens_map)
+        _kwargs.update(kwargs)
+        kwargs = _kwargs
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return BloomTokenizerFast.from_pretrained(pretrained_name, **kwargs)

     @unittest.skip(reason="This needs a slow tokenizer. Bloom does not have one!")
     def test_encode_decode_with_spaces(self):
@@ -65,7 +74,7 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=6):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
                 # tokenizer_r.pad_token = None # Hotfixing padding = None
                 # Simple input
                 s = "This is a simple input"
@@ -19,12 +19,13 @@ import re
 import shutil
 import tempfile
 import unittest
+from functools import lru_cache
 from typing import Tuple

 from transformers import AddedToken, BatchEncoding, ByT5Tokenizer
 from transformers.utils import cached_property, is_tf_available, is_torch_available

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 if is_torch_available():
@@ -39,17 +40,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = ByT5Tokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
         tokenizer = ByT5Tokenizer()
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     @cached_property
     def t5_base_tokenizer(self):
         return ByT5Tokenizer.from_pretrained("google/byt5-small")

-    def get_tokenizer(self, **kwargs) -> ByT5Tokenizer:
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> ByT5Tokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
         # XXX The default common tokenizer tests assume that every ID is decodable on its own.
@@ -15,6 +15,7 @@

 import tempfile
 import unittest
+from tempfile import TemporaryDirectory

 from transformers import AddedToken, CamembertTokenizer, CamembertTokenizerFast
 from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
@@ -38,12 +39,13 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     @unittest.skip(
         "Token maps are not equal because someone set the probability of ('<unk>NOTUSED', -100), so it's never encoded for fast"
@@ -72,8 +74,9 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     def test_rust_and_python_bpe_tokenizers(self):
         tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
-        rust_tokenizer = CamembertTokenizerFast.from_pretrained(self.tmpdirname)
+        with TemporaryDirectory() as tmpdirname:
+            tokenizer.save_pretrained(tmpdirname)
+            rust_tokenizer = CamembertTokenizerFast.from_pretrained(tmpdirname)

         sequence = "I was born in 92000, and this is falsé."

@@ -147,11 +150,11 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
             return tokenizer

-        new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False)
+        new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 # Load a slow tokenizer from the hub, init with the new token for fast to also include it
-                tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
+                tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos)
                 EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
                 with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
                     self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
@@ -191,9 +194,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
                     if self.rust_tokenizer_class is not None:
-                        tokenizer_fast = self.rust_tokenizer_class.from_pretrained(
-                            pretrained_name, eos_token=new_eos, from_slow=True
-                        )
+                        tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos, from_slow=True)
                         self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
                         self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                         # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
@@ -18,13 +18,14 @@ import os
 import shutil
 import tempfile
 import unittest
+from functools import lru_cache

 from transformers import BatchEncoding, CanineTokenizer
 from transformers.testing_utils import require_tokenizers, require_torch
 from transformers.tokenization_utils import AddedToken
 from transformers.utils import cached_property

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@@ -32,17 +33,22 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = CanineTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
         tokenizer = CanineTokenizer()
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     @cached_property
     def canine_tokenizer(self):
         return CanineTokenizer.from_pretrained("google/canine-s")

-    def get_tokenizer(self, **kwargs) -> CanineTokenizer:
-        tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> CanineTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        tokenizer = cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
         tokenizer._unicode_vocab_size = 1024
         return tokenizer

@@ -17,12 +17,13 @@
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers import CLIPTokenizer, CLIPTokenizerFast
 from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_ftfy, require_tokenizers

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -34,28 +35,37 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_kwargs = {}
     test_seq2seq = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l</w>", "w</w>", "r</w>", "t</w>", "low</w>", "er</w>", "lowest</w>", "newer</w>", "wider", "<unk>", "<|startoftext|>", "<|endoftext|>"]  # fmt: skip
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w</w>", "e r</w>"]
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return CLIPTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return CLIPTokenizer.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return CLIPTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return CLIPTokenizerFast.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
@@ -77,8 +87,8 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_check_encoding_slow_fast(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_s = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_s = self.get_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."
                 text_tokenized_s = tokenizer_s.tokenize(text)
@@ -138,7 +148,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                text_of_1_token = "hello"  # `hello` is a token in the vocabulary of `pretrained_name`
                text = f"{text_of_1_token} {text_of_1_token}"

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                    pretrained_name,
                    use_fast=True,
                )
@@ -151,7 +161,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                text = f" {text}"

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                    pretrained_name,
                    use_fast=True,
                )
@@ -166,7 +176,7 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        # Test related to the breaking change introduced in transformers v4.17.0
        # We need to check that an error in raised when the user try to load a previous version of the tokenizer.
        with self.assertRaises(ValueError) as context:
-            self.rust_tokenizer_class.from_pretrained("robot-test/old-clip-tokenizer")
+            self.get_rust_tokenizer("robot-test/old-clip-tokenizer")

        self.assertTrue(
            context.exception.args[0].startswith(
@@ -17,11 +17,12 @@
 import json
 import os
 import unittest
+from functools import lru_cache
 from typing import List

 from transformers import ClvpTokenizer

-from ...test_tokenization_common import TokenizerTesterMixin, slow
+from ...test_tokenization_common import TokenizerTesterMixin, slow, use_cache_if_possible


 class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@@ -32,8 +33,9 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_seq2seq = False
     test_sentencepiece_ignore_case = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = [
@@ -62,19 +64,23 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, "vocab.json")
-        self.merges_file = os.path.join(self.tmpdirname, "merges.txt")
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, "vocab.json")
+        cls.merges_file = os.path.join(cls.tmpdirname, "merges.txt")
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

     # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_tokenizer with GPT2->Clvp
-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return ClvpTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return ClvpTokenizer.from_pretrained(pretrained_name, **kwargs)

     # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
     def get_input_output_texts(self, tokenizer):
@@ -134,7 +140,7 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=15):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 # Simple input
                 s = "This is a simple input"
@@ -53,15 +53,16 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_sentencepiece = True
     from_pretrained_kwargs = {}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
         tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_tokenizers(self, **kwargs):
+    def get_tokenizers(cls, **kwargs):
         kwargs.update({"pad_token": "<PAD>"})
         return super().get_tokenizers(**kwargs)

@@ -151,8 +152,8 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        ]
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                tmpdirname2 = tempfile.mkdtemp()

@@ -255,7 +256,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                added_tokens = [AddedToken("<special>", lstrip=True)]

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
                )
                r_output = tokenizer_r.encode("Hey this is a <special> token")
@@ -265,7 +266,7 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                self.assertTrue(special_token_id in r_output)

                if self.test_slow_tokenizer:
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_cr = self.get_rust_tokenizer(
                        pretrained_name,
                        additional_special_tokens=added_tokens,
                        **kwargs,  # , from_slow=True <- unfortunately too slow to convert
@ -18,12 +18,13 @@ import json
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import unittest
|
import unittest
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from transformers import CodeGenTokenizer, CodeGenTokenizerFast
|
from transformers import CodeGenTokenizer, CodeGenTokenizerFast
|
||||||
from transformers.models.codegen.tokenization_codegen import VOCAB_FILES_NAMES
|
from transformers.models.codegen.tokenization_codegen import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import require_tokenizers, slow
|
from transformers.testing_utils import require_tokenizers, slow
|
||||||
|
|
||||||
from ...test_tokenization_common import TokenizerTesterMixin
|
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
|
||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
@ -35,8 +36,9 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
from_pretrained_kwargs = {"add_prefix_space": True}
|
from_pretrained_kwargs = {"add_prefix_space": True}
|
||||||
test_seq2seq = False
|
test_seq2seq = False
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = [
|
vocab = [
|
||||||
@ -64,22 +66,30 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
]
|
]
|
||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
|
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
|
||||||
self.special_tokens_map = {"unk_token": "<unk>"}
|
cls.special_tokens_map = {"unk_token": "<unk>"}
|
||||||
|
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write(json.dumps(vocab_tokens) + "\n")
|
fp.write(json.dumps(vocab_tokens) + "\n")
|
||||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
with open(cls.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
kwargs.update(self.special_tokens_map)
|
@use_cache_if_possible
|
||||||
return CodeGenTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
@lru_cache(maxsize=64)
|
||||||
|
def get_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
kwargs.update(cls.special_tokens_map)
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return CodeGenTokenizer.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_rust_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
kwargs.update(self.special_tokens_map)
|
@use_cache_if_possible
|
||||||
return CodeGenTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
@lru_cache(maxsize=64)
|
||||||
|
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
kwargs.update(cls.special_tokens_map)
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return CodeGenTokenizerFast.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "lower newer"
|
input_text = "lower newer"
|
||||||
@ -136,7 +146,7 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def test_padding(self, max_length=15):
|
def test_padding(self, max_length=15):
|
||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
|
||||||
|
|
||||||
# Simple input
|
# Simple input
|
||||||
s = "This is a simple input"
|
s = "This is a simple input"
|
||||||
|
@ -13,12 +13,14 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
import copy
|
||||||
import unittest
|
import unittest
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from transformers import CohereTokenizerFast
|
from transformers import CohereTokenizerFast
|
||||||
from transformers.testing_utils import require_jinja, require_tokenizers, require_torch_multi_gpu
|
from transformers.testing_utils import require_jinja, require_tokenizers, require_torch_multi_gpu
|
||||||
|
|
||||||
from ...test_tokenization_common import TokenizerTesterMixin
|
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
|
||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
@ -37,14 +39,21 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
"pad_token": "<PAD>",
|
"pad_token": "<PAD>",
|
||||||
}
|
}
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
tokenizer = CohereTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-CohereForCausalLM")
|
tokenizer = CohereTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-CohereForCausalLM")
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(cls.tmpdirname)
|
||||||
|
|
||||||
def get_rust_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
kwargs.update(self.special_tokens_map)
|
@use_cache_if_possible
|
||||||
return CohereTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
@lru_cache(maxsize=64)
|
||||||
|
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
_kwargs = copy.deepcopy(cls.special_tokens_map)
|
||||||
|
_kwargs.update(kwargs)
|
||||||
|
kwargs = _kwargs
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return CohereTokenizerFast.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
# This gives CPU OOM on a single-gpu runner (~60G RAM). On multi-gpu runner, it has ~180G RAM which is enough.
|
# This gives CPU OOM on a single-gpu runner (~60G RAM). On multi-gpu runner, it has ~180G RAM which is enough.
|
||||||
@require_torch_multi_gpu
|
@require_torch_multi_gpu
|
||||||
@ -80,7 +89,7 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def test_padding(self, max_length=10):
|
def test_padding(self, max_length=10):
|
||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
|
||||||
# tokenizer_r.pad_token = None # Hotfixing padding = None
|
# tokenizer_r.pad_token = None # Hotfixing padding = None
|
||||||
# Simple input
|
# Simple input
|
||||||
s = "This is a simple input"
|
s = "This is a simple input"
|
||||||
|
@ -28,8 +28,9 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = CpmAntTokenizer
|
tokenizer_class = CpmAntTokenizer
|
||||||
test_rust_tokenizer = False
|
test_rust_tokenizer = False
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
vocab_tokens = [
|
vocab_tokens = [
|
||||||
"<d>",
|
"<d>",
|
||||||
@ -49,8 +50,8 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
"n",
|
"n",
|
||||||
"t",
|
"t",
|
||||||
]
|
]
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
|
|
||||||
@tooslow
|
@tooslow
|
||||||
|
@ -16,10 +16,11 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from transformers.models.ctrl.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer
|
from transformers.models.ctrl.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer
|
||||||
|
|
||||||
from ...test_tokenization_common import TokenizerTesterMixin
|
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
|
||||||
|
|
||||||
|
|
||||||
class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||||
@ -28,25 +29,30 @@ class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_rust_tokenizer = False
|
test_rust_tokenizer = False
|
||||||
test_seq2seq = False
|
test_seq2seq = False
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"]
|
vocab = ["adapt", "re@@", "a@@", "apt", "c@@", "t", "<unk>"]
|
||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["#version: 0.2", "a p", "ap t</w>", "r e", "a d", "ad apt</w>", ""]
|
merges = ["#version: 0.2", "a p", "ap t</w>", "r e", "a d", "ad apt</w>", ""]
|
||||||
self.special_tokens_map = {"unk_token": "<unk>"}
|
cls.special_tokens_map = {"unk_token": "<unk>"}
|
||||||
|
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write(json.dumps(vocab_tokens) + "\n")
|
fp.write(json.dumps(vocab_tokens) + "\n")
|
||||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
with open(cls.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
kwargs.update(self.special_tokens_map)
|
@use_cache_if_possible
|
||||||
return CTRLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
@lru_cache(maxsize=64)
|
||||||
|
def get_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
kwargs.update(cls.special_tokens_map)
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return CTRLTokenizer.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "adapt react readapt apt"
|
input_text = "adapt react readapt apt"
|
||||||
|
@ -17,12 +17,13 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from transformers import DebertaTokenizer, DebertaTokenizerFast
|
from transformers import DebertaTokenizer, DebertaTokenizerFast
|
||||||
from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES
|
from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import slow
|
from transformers.testing_utils import slow
|
||||||
|
|
||||||
from ...test_tokenization_common import TokenizerTesterMixin
|
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
|
||||||
|
|
||||||
|
|
||||||
class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||||
@ -31,8 +32,9 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
rust_tokenizer_class = DebertaTokenizerFast
|
rust_tokenizer_class = DebertaTokenizerFast
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = [
|
vocab = [
|
||||||
@ -59,18 +61,22 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
]
|
]
|
||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
|
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
|
||||||
self.special_tokens_map = {"unk_token": "[UNK]"}
|
cls.special_tokens_map = {"unk_token": "[UNK]"}
|
||||||
|
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write(json.dumps(vocab_tokens) + "\n")
|
fp.write(json.dumps(vocab_tokens) + "\n")
|
||||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
with open(cls.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
kwargs.update(self.special_tokens_map)
|
@use_cache_if_possible
|
||||||
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
|
@lru_cache(maxsize=64)
|
||||||
|
def get_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
kwargs.update(cls.special_tokens_map)
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "lower newer"
|
input_text = "lower newer"
|
||||||
|
@ -33,12 +33,13 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_sentencepiece = True
|
test_sentencepiece = True
|
||||||
test_sentencepiece_ignore_case = True
|
test_sentencepiece_ignore_case = True
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# We have a SentencePiece fixture for testing
|
# We have a SentencePiece fixture for testing
|
||||||
tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>")
|
tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="<unk>")
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(cls.tmpdirname)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "this is a test"
|
input_text = "this is a test"
|
||||||
|
@ -17,11 +17,11 @@
|
|||||||
from transformers import DistilBertTokenizer, DistilBertTokenizerFast
|
from transformers import DistilBertTokenizer, DistilBertTokenizerFast
|
||||||
from transformers.testing_utils import require_tokenizers, slow
|
from transformers.testing_utils import require_tokenizers, slow
|
||||||
|
|
||||||
from ..bert.test_tokenization_bert import BertTokenizationTest
|
from ..bert import test_tokenization_bert
|
||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
class DistilBertTokenizationTest(BertTokenizationTest):
|
class DistilBertTokenizationTest(test_tokenization_bert.BertTokenizationTest):
|
||||||
tokenizer_class = DistilBertTokenizer
|
tokenizer_class = DistilBertTokenizer
|
||||||
rust_tokenizer_class = DistilBertTokenizerFast
|
rust_tokenizer_class = DistilBertTokenizerFast
|
||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
|
@ -25,11 +25,11 @@ from transformers import (
|
|||||||
from transformers.testing_utils import require_tokenizers, slow
|
from transformers.testing_utils import require_tokenizers, slow
|
||||||
from transformers.tokenization_utils_base import BatchEncoding
|
from transformers.tokenization_utils_base import BatchEncoding
|
||||||
|
|
||||||
from ..bert.test_tokenization_bert import BertTokenizationTest
|
from ..bert import test_tokenization_bert
|
||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
class DPRContextEncoderTokenizationTest(BertTokenizationTest):
|
class DPRContextEncoderTokenizationTest(test_tokenization_bert.BertTokenizationTest):
|
||||||
tokenizer_class = DPRContextEncoderTokenizer
|
tokenizer_class = DPRContextEncoderTokenizer
|
||||||
rust_tokenizer_class = DPRContextEncoderTokenizerFast
|
rust_tokenizer_class = DPRContextEncoderTokenizerFast
|
||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
@ -37,7 +37,7 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest):
|
|||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
|
class DPRQuestionEncoderTokenizationTest(test_tokenization_bert.BertTokenizationTest):
|
||||||
tokenizer_class = DPRQuestionEncoderTokenizer
|
tokenizer_class = DPRQuestionEncoderTokenizer
|
||||||
rust_tokenizer_class = DPRQuestionEncoderTokenizerFast
|
rust_tokenizer_class = DPRQuestionEncoderTokenizerFast
|
||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
@ -45,7 +45,7 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):
|
|||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
class DPRReaderTokenizationTest(BertTokenizationTest):
|
class DPRReaderTokenizationTest(test_tokenization_bert.BertTokenizationTest):
|
||||||
tokenizer_class = DPRReaderTokenizer
|
tokenizer_class = DPRReaderTokenizer
|
||||||
rust_tokenizer_class = DPRReaderTokenizerFast
|
rust_tokenizer_class = DPRReaderTokenizerFast
|
||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
|
@ -40,8 +40,9 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
space_between_special_tokens = True
|
space_between_special_tokens = True
|
||||||
from_pretrained_filter = filter_non_english
|
from_pretrained_filter = filter_non_english
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
vocab_tokens = [
|
vocab_tokens = [
|
||||||
"[UNK]",
|
"[UNK]",
|
||||||
@ -60,8 +61,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
"low",
|
"low",
|
||||||
"lowest",
|
"lowest",
|
||||||
]
|
]
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
@ -250,7 +251,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def test_offsets_with_special_characters(self):
|
def test_offsets_with_special_characters(self):
|
||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
|
||||||
|
|
||||||
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
|
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
|
||||||
tokens = tokenizer_r.encode_plus(
|
tokens = tokenizer_r.encode_plus(
|
||||||
@ -305,8 +306,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
kwargs["tokenize_chinese_chars"] = True
|
kwargs["tokenize_chinese_chars"] = True
|
||||||
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
|
||||||
|
|
||||||
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
|
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
|
||||||
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
|
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
|
||||||
@ -319,8 +320,8 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)
|
self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)
|
||||||
|
|
||||||
kwargs["tokenize_chinese_chars"] = False
|
kwargs["tokenize_chinese_chars"] = False
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
|
||||||
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
|
||||||
|
|
||||||
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
|
ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
|
||||||
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
|
ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
|
from functools import lru_cache
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from transformers.models.esm.tokenization_esm import VOCAB_FILES_NAMES, EsmTokenizer
|
from transformers.models.esm.tokenization_esm import VOCAB_FILES_NAMES, EsmTokenizer
|
||||||
@ -24,24 +25,32 @@ from transformers.testing_utils import require_tokenizers
|
|||||||
from transformers.tokenization_utils import PreTrainedTokenizer
|
from transformers.tokenization_utils import PreTrainedTokenizer
|
||||||
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
|
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
|
||||||
|
|
||||||
|
from ...test_tokenization_common import use_cache_if_possible
|
||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
class ESMTokenizationTest(unittest.TestCase):
|
class ESMTokenizationTest(unittest.TestCase):
|
||||||
tokenizer_class = EsmTokenizer
|
tokenizer_class = EsmTokenizer
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
self.tmpdirname = tempfile.mkdtemp()
|
super().setUpClass()
|
||||||
|
|
||||||
|
cls.tmpdirname = tempfile.mkdtemp()
|
||||||
vocab_tokens: List[str] = ["<cls>", "<pad>", "<eos>", "<unk>", "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z", "O", ".", "-", "<null_1>", "<mask>"] # fmt: skip
|
vocab_tokens: List[str] = ["<cls>", "<pad>", "<eos>", "<unk>", "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z", "O", ".", "-", "<null_1>", "<mask>"] # fmt: skip
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
|
|
||||||
def get_tokenizers(self, **kwargs) -> List[PreTrainedTokenizerBase]:
|
def get_tokenizers(cls, **kwargs) -> List[PreTrainedTokenizerBase]:
|
||||||
return [self.get_tokenizer(**kwargs)]
|
return [cls.get_tokenizer(**kwargs)]
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
|
@classmethod
|
||||||
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
|
@use_cache_if_possible
|
||||||
|
@lru_cache(maxsize=64)
|
||||||
|
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizer:
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def test_tokenizer_single_example(self):
|
def test_tokenizer_single_example(self):
|
||||||
tokenizer = self.tokenizer_class(self.vocab_file)
|
tokenizer = self.tokenizer_class(self.vocab_file)
|
||||||
|
@ -28,10 +28,11 @@ class FastSpeech2ConformerTokenizerTest(TokenizerTesterMixin, unittest.TestCase)
|
|||||||
tokenizer_class = FastSpeech2ConformerTokenizer
|
tokenizer_class = FastSpeech2ConformerTokenizer
|
||||||
test_rust_tokenizer = False
|
test_rust_tokenizer = False
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
|
tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer")
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(cls.tmpdirname)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "this is a test"
|
input_text = "this is a test"
|
||||||
|
@ -30,8 +30,9 @@ class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = FlaubertTokenizer
|
tokenizer_class = FlaubertTokenizer
|
||||||
test_rust_tokenizer = False
|
test_rust_tokenizer = False
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "i</w>", "lo", "low", "ne", "new", "er</w>", "low</w>", "lowest</w>", "new</w>", "newer</w>", "wider</w>", "<unk>"] # fmt: skip
|
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "i</w>", "lo", "low", "ne", "new", "er</w>", "low</w>", "lowest</w>", "new</w>", "newer</w>", "wider</w>", "<unk>"] # fmt: skip
|
||||||
@ -39,11 +40,11 @@ class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["n e 300", "ne w 301", "e r</w> 302", ""]
|
merges = ["n e 300", "ne w 301", "e r</w> 302", ""]
|
||||||
|
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write(json.dumps(vocab_tokens) + "\n")
|
fp.write(json.dumps(vocab_tokens) + "\n")
|
||||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
with open(cls.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
# Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_full_tokenizer
|
# Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_full_tokenizer
|
||||||
|
@ -36,12 +36,13 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_sentencepiece_ignore_case = True
|
test_sentencepiece_ignore_case = True
|
||||||
test_seq2seq = False
|
test_seq2seq = False
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# We have a SentencePiece fixture for testing
|
# We have a SentencePiece fixture for testing
|
||||||
tokenizer = FNetTokenizer(SAMPLE_VOCAB)
|
tokenizer = FNetTokenizer(SAMPLE_VOCAB)
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(cls.tmpdirname)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "this is a test"
|
input_text = "this is a test"
|
||||||
@ -147,7 +148,7 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
added_tokens = [AddedToken("<special>", lstrip=True)]
|
added_tokens = [AddedToken("<special>", lstrip=True)]
|
||||||
|
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
|
tokenizer_r = self.get_rust_tokenizer(
|
||||||
pretrained_name, additional_special_tokens=added_tokens, **kwargs
|
pretrained_name, additional_special_tokens=added_tokens, **kwargs
|
||||||
)
|
)
|
||||||
r_output = tokenizer_r.encode("Hey this is a <special> token")
|
r_output = tokenizer_r.encode("Hey this is a <special> token")
|
||||||
@ -175,7 +176,7 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
added_tokens = [AddedToken("<special>", lstrip=True)]
|
added_tokens = [AddedToken("<special>", lstrip=True)]
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
|
tokenizer_r = self.get_rust_tokenizer(
|
||||||
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
|
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
|
||||||
)
|
)
|
||||||
special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0]
|
special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0]
|
||||||
@ -198,8 +199,8 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
|
||||||
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
|
||||||
|
|
||||||
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
|
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
|
||||||
pad_token_id = tokenizer_p.pad_token_id
|
pad_token_id = tokenizer_p.pad_token_id
|
||||||
|
@ -34,8 +34,9 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer_class = FSMTTokenizer
|
tokenizer_class = FSMTTokenizer
|
||||||
test_rust_tokenizer = False
|
test_rust_tokenizer = False
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = [
|
vocab = [
|
||||||
@ -64,22 +65,22 @@ class FSMTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
|
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
|
||||||
|
|
||||||
self.langs = ["en", "ru"]
|
cls.langs = ["en", "ru"]
|
||||||
config = {
|
config = {
|
||||||
"langs": self.langs,
|
"langs": cls.langs,
|
||||||
"src_vocab_size": 10,
|
"src_vocab_size": 10,
|
||||||
"tgt_vocab_size": 20,
|
"tgt_vocab_size": 20,
|
||||||
}
|
}
|
||||||
|
|
||||||
self.src_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"])
|
cls.src_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["src_vocab_file"])
|
||||||
self.tgt_vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"])
|
cls.tgt_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["tgt_vocab_file"])
|
||||||
config_file = os.path.join(self.tmpdirname, "tokenizer_config.json")
|
config_file = os.path.join(cls.tmpdirname, "tokenizer_config.json")
|
||||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
||||||
with open(self.src_vocab_file, "w") as fp:
|
with open(cls.src_vocab_file, "w") as fp:
|
||||||
fp.write(json.dumps(vocab_tokens))
|
fp.write(json.dumps(vocab_tokens))
|
||||||
with open(self.tgt_vocab_file, "w") as fp:
|
with open(cls.tgt_vocab_file, "w") as fp:
|
||||||
fp.write(json.dumps(vocab_tokens))
|
fp.write(json.dumps(vocab_tokens))
|
||||||
with open(self.merges_file, "w") as fp:
|
with open(cls.merges_file, "w") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
with open(config_file, "w") as fp:
|
with open(config_file, "w") as fp:
|
||||||
fp.write(json.dumps(config))
|
fp.write(json.dumps(config))
|
||||||
|
@ -16,12 +16,13 @@
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from transformers import FunnelTokenizer, FunnelTokenizerFast
|
from transformers import FunnelTokenizer, FunnelTokenizerFast
|
||||||
from transformers.models.funnel.tokenization_funnel import VOCAB_FILES_NAMES
|
from transformers.models.funnel.tokenization_funnel import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import require_tokenizers
|
from transformers.testing_utils import require_tokenizers
|
||||||
|
|
||||||
from ...test_tokenization_common import TokenizerTesterMixin
|
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
|
||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
@ -32,8 +33,9 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
space_between_special_tokens = True
|
space_between_special_tokens = True
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
vocab_tokens = [
|
vocab_tokens = [
|
||||||
"<unk>",
|
"<unk>",
|
||||||
@ -50,15 +52,23 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
"low",
|
"low",
|
||||||
"lowest",
|
"lowest",
|
||||||
]
|
]
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
return FunnelTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
@use_cache_if_possible
|
||||||
|
@lru_cache(maxsize=64)
|
||||||
|
def get_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return FunnelTokenizer.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_rust_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
return FunnelTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
@use_cache_if_possible
|
||||||
|
@lru_cache(maxsize=64)
|
||||||
|
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return FunnelTokenizerFast.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "UNwant\u00e9d,running"
|
input_text = "UNwant\u00e9d,running"
|
||||||
|
@ -53,12 +53,13 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_sentencepiece = True
|
test_sentencepiece = True
|
||||||
from_pretrained_kwargs = {}
|
from_pretrained_kwargs = {}
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
# We have a SentencePiece fixture for testing
|
# We have a SentencePiece fixture for testing
|
||||||
tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||||
tokenizer.pad_token = tokenizer.eos_token
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(cls.tmpdirname)
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
def test_batch_tokenization(self):
|
def test_batch_tokenization(self):
|
||||||
@ -103,7 +104,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
added_tokens = [AddedToken("<special>", lstrip=True)]
|
added_tokens = [AddedToken("<special>", lstrip=True)]
|
||||||
|
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
|
tokenizer_r = self.get_rust_tokenizer(
|
||||||
pretrained_name, additional_special_tokens=added_tokens, **kwargs
|
pretrained_name, additional_special_tokens=added_tokens, **kwargs
|
||||||
)
|
)
|
||||||
r_output = tokenizer_r.encode("Hey this is a <special> token")
|
r_output = tokenizer_r.encode("Hey this is a <special> token")
|
||||||
@ -113,7 +114,7 @@ class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
self.assertTrue(special_token_id in r_output)
|
self.assertTrue(special_token_id in r_output)
|
||||||
|
|
||||||
if self.test_slow_tokenizer:
|
if self.test_slow_tokenizer:
|
||||||
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
|
tokenizer_cr = self.get_rust_tokenizer(
|
||||||
pretrained_name,
|
pretrained_name,
|
||||||
additional_special_tokens=added_tokens,
|
additional_special_tokens=added_tokens,
|
||||||
**kwargs, # , from_slow=True <- unfortunately too slow to convert
|
**kwargs, # , from_slow=True <- unfortunately too slow to convert
|
||||||
|
@ -17,12 +17,13 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast
|
from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast
|
||||||
from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES
|
from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers
|
from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers
|
||||||
|
|
||||||
from ...test_tokenization_common import TokenizerTesterMixin
|
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
|
||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
@ -34,8 +35,9 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
from_pretrained_kwargs = {"add_prefix_space": True}
|
from_pretrained_kwargs = {"add_prefix_space": True}
|
||||||
test_seq2seq = False
|
test_seq2seq = False
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
|
||||||
vocab = [
|
vocab = [
|
||||||
@ -63,22 +65,30 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
]
|
]
|
||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
|
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
|
||||||
self.special_tokens_map = {"unk_token": "<unk>"}
|
cls.special_tokens_map = {"unk_token": "<unk>"}
|
||||||
|
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as fp:
|
with open(cls.vocab_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write(json.dumps(vocab_tokens) + "\n")
|
fp.write(json.dumps(vocab_tokens) + "\n")
|
||||||
with open(self.merges_file, "w", encoding="utf-8") as fp:
|
with open(cls.merges_file, "w", encoding="utf-8") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
kwargs.update(self.special_tokens_map)
|
@use_cache_if_possible
|
||||||
return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
@lru_cache(maxsize=64)
|
||||||
|
def get_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
kwargs.update(cls.special_tokens_map)
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return GPT2Tokenizer.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_rust_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
kwargs.update(self.special_tokens_map)
|
@use_cache_if_possible
|
||||||
return GPT2TokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
|
@lru_cache(maxsize=64)
|
||||||
|
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
kwargs.update(cls.special_tokens_map)
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return GPT2TokenizerFast.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "lower newer"
|
input_text = "lower newer"
|
||||||
@ -135,7 +145,7 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def test_padding(self, max_length=15):
|
def test_padding(self, max_length=15):
|
||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
|
||||||
|
|
||||||
# Simple input
|
# Simple input
|
||||||
s = "This is a simple input"
|
s = "This is a simple input"
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import (
|
from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import (
|
||||||
VOCAB_FILES_NAMES,
|
VOCAB_FILES_NAMES,
|
||||||
@ -24,7 +25,7 @@ from transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese import
|
|||||||
)
|
)
|
||||||
from transformers.testing_utils import require_tokenizers, slow
|
from transformers.testing_utils import require_tokenizers, slow
|
||||||
|
|
||||||
from ...test_tokenization_common import TokenizerTesterMixin
|
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
|
||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
@ -34,8 +35,9 @@ class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_rust_tokenizer = False
|
test_rust_tokenizer = False
|
||||||
from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
|
from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
vocab_tokens = [
|
vocab_tokens = [
|
||||||
"こん",
|
"こん",
|
||||||
@ -62,18 +64,22 @@ class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
"<|endoftext|>",
|
"<|endoftext|>",
|
||||||
]
|
]
|
||||||
emoji_tokens = {"emoji": {"\ud83d\ude00": "<|emoji1|>"}, "emoji_inv": {"<|emoji1|>": "\ud83d\ude00"}} # 😀
|
emoji_tokens = {"emoji": {"\ud83d\ude00": "<|emoji1|>"}, "emoji_inv": {"<|emoji1|>": "\ud83d\ude00"}} # 😀
|
||||||
self.special_tokens_map = {"unk_token": "<unk>"}
|
cls.special_tokens_map = {"unk_token": "<unk>"}
|
||||||
|
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
self.emoji_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["emoji_file"])
|
cls.emoji_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["emoji_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
with open(self.emoji_file, "w") as emoji_writer:
|
with open(cls.emoji_file, "w") as emoji_writer:
|
||||||
emoji_writer.write(json.dumps(emoji_tokens))
|
emoji_writer.write(json.dumps(emoji_tokens))
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
kwargs.update(self.special_tokens_map)
|
@use_cache_if_possible
|
||||||
return GPTNeoXJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
@lru_cache(maxsize=64)
|
||||||
|
def get_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
kwargs.update(cls.special_tokens_map)
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return GPTNeoXJapaneseTokenizer.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "こんにちは、世界。 \nこんばんは、㔺界。😀"
|
input_text = "こんにちは、世界。 \nこんばんは、㔺界。😀"
|
||||||
|
@ -33,13 +33,14 @@ class GPTSw3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_sentencepiece = True
|
test_sentencepiece = True
|
||||||
test_sentencepiece_ignore_case = False
|
test_sentencepiece_ignore_case = False
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# We have a SentencePiece fixture for testing
|
# We have a SentencePiece fixture for testing
|
||||||
tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, eos_token="<unk>", bos_token="<unk>", pad_token="<unk>")
|
tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, eos_token="<unk>", bos_token="<unk>", pad_token="<unk>")
|
||||||
|
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(cls.tmpdirname)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "This is a test"
|
input_text = "This is a test"
|
||||||
|
@ -33,12 +33,13 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
rust_tokenizer_class = HerbertTokenizerFast
|
rust_tokenizer_class = HerbertTokenizerFast
|
||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# Use a simpler test file without japanese/chinese characters
|
# Use a simpler test file without japanese/chinese characters
|
||||||
with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data:
|
with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data:
|
||||||
self._data = f_data.read().replace("\n\n", "\n").strip()
|
cls._data = f_data.read().replace("\n\n", "\n").strip()
|
||||||
|
|
||||||
vocab = [
|
vocab = [
|
||||||
"<s>",
|
"<s>",
|
||||||
@ -69,11 +70,11 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
vocab_tokens = dict(zip(vocab, range(len(vocab))))
|
||||||
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
|
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
|
||||||
|
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
|
||||||
with open(self.vocab_file, "w") as fp:
|
with open(cls.vocab_file, "w") as fp:
|
||||||
fp.write(json.dumps(vocab_tokens))
|
fp.write(json.dumps(vocab_tokens))
|
||||||
with open(self.merges_file, "w") as fp:
|
with open(cls.merges_file, "w") as fp:
|
||||||
fp.write("\n".join(merges))
|
fp.write("\n".join(merges))
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
|
@ -16,12 +16,13 @@
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast
|
from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast
|
||||||
from transformers.models.layoutlm.tokenization_layoutlm import VOCAB_FILES_NAMES
|
from transformers.models.layoutlm.tokenization_layoutlm import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import require_tokenizers
|
from transformers.testing_utils import require_tokenizers
|
||||||
|
|
||||||
from ...test_tokenization_common import TokenizerTesterMixin
|
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
|
||||||
|
|
||||||
|
|
||||||
@require_tokenizers
|
@require_tokenizers
|
||||||
@ -32,8 +33,9 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
space_between_special_tokens = True
|
space_between_special_tokens = True
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
vocab_tokens = [
|
vocab_tokens = [
|
||||||
"[UNK]",
|
"[UNK]",
|
||||||
@ -50,12 +52,16 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
"low",
|
"low",
|
||||||
"lowest",
|
"lowest",
|
||||||
]
|
]
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs):
|
@classmethod
|
||||||
return LayoutLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
@use_cache_if_possible
|
||||||
|
@lru_cache(maxsize=64)
|
||||||
|
def get_tokenizer(cls, pretrained_name=None, **kwargs):
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return LayoutLMTokenizer.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "UNwant\u00e9d,running"
|
input_text = "UNwant\u00e9d,running"
|
||||||
|
@@ -102,8 +102,9 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

         return questions, words, boxes

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = [
             "[UNK]",
@@ -122,8 +123,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "test",
             "lowest",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

     def get_input_output_texts(self, tokenizer):
@@ -267,7 +268,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_offsets_with_special_characters(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 words, boxes = self.get_words_and_boxes()
                 words[1] = tokenizer_r.mask_token
@@ -605,8 +606,8 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id
@@ -1060,7 +1061,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 # Input tokens id
                 words, boxes = self.get_words_and_boxes()
@@ -1363,7 +1364,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 words, boxes = self.get_words_and_boxes()

@@ -1417,7 +1418,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 words, boxes = self.get_words_and_boxes()
                 tokens_r = tokenizer_r.encode_plus(
                     words,
@@ -1715,7 +1716,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id

@@ -20,6 +20,7 @@ import re
 import shutil
 import tempfile
 import unittest
+from functools import lru_cache
 from typing import List

 from parameterized import parameterized
@@ -41,7 +42,12 @@ from transformers.testing_utils import (
     slow,
 )

-from ...test_tokenization_common import SMALL_TRAINING_CORPUS, TokenizerTesterMixin, merge_model_tokenizer_mappings
+from ...test_tokenization_common import (
+    SMALL_TRAINING_CORPUS,
+    TokenizerTesterMixin,
+    merge_model_tokenizer_mappings,
+    use_cache_if_possible,
+)


 logger = logging.get_logger(__name__)
@@ -91,8 +97,9 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

         return questions, words, boxes

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = [
@@ -119,22 +126,30 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return LayoutLMv3TokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return LayoutLMv3TokenizerFast.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
@@ -485,8 +500,8 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id
@@ -940,7 +955,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 # Input tokens id
                 words, boxes = self.get_words_and_boxes()
@@ -1241,7 +1256,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 words, boxes = self.get_words_and_boxes()

@@ -1295,7 +1310,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 words, boxes = self.get_words_and_boxes()
                 tokens_r = tokenizer_r.encode_plus(
                     words,
@@ -1593,7 +1608,7 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id

@@ -96,12 +96,13 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

         return questions, words, boxes

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = LayoutXLMTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def get_input_output_texts(self, tokenizer):
         input_text = "UNwant\u00e9d,running"
@@ -157,7 +158,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             _, _, boxes = self.get_question_words_and_boxes()

             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_rust = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_rust = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
                 )
                 tokenizer_py = self.tokenizer_class.from_pretrained(
@@ -206,7 +207,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_offsets_with_special_characters(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 words, boxes = self.get_words_and_boxes()
                 words[1] = tokenizer_r.mask_token
@@ -536,8 +537,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id
@@ -990,8 +991,8 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 # Input tokens id
                 words, boxes = self.get_words_and_boxes()
@@ -1292,7 +1293,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 words, boxes = self.get_words_and_boxes()

@@ -1346,7 +1347,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 words, boxes = self.get_words_and_boxes()
                 tokens_r = tokenizer_r.encode_plus(
                     words,
@@ -1644,7 +1645,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id

@@ -1743,7 +1744,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 tmpdirname2 = tempfile.mkdtemp()

@@ -14,13 +14,14 @@
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers import BatchEncoding, LEDTokenizer, LEDTokenizerFast
 from transformers.models.led.tokenization_led import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_tokenizers, require_torch
 from transformers.utils import cached_property

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -30,8 +31,10 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
     rust_tokenizer_class = LEDTokenizerFast
     test_rust_tokenizer = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = [
             "l",
             "o",
@@ -56,22 +59,30 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         return "lower newer", "lower newer"
@@ -161,8 +172,8 @@ class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 sentence = "A, <mask> AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                 tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
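
Note: the companion change in each setUpClass above is that temporary vocab/merges fixtures are now written once per test class rather than once per test method, which is why the instance attributes (self.vocab_file, self.special_tokens_map, ...) become class attributes on cls. A minimal, self-contained unittest illustration of that behaviour follows; every name in it is invented for the example and is not taken from the commit.

import json
import os
import tempfile
import unittest


class ExampleFixtureTest(unittest.TestCase):
    builds = 0  # counts how many times the fixture is written

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.tmpdirname = tempfile.mkdtemp()
        cls.vocab_file = os.path.join(cls.tmpdirname, "vocab.json")
        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps({"low": 0, "er": 1, "<unk>": 2}))
        cls.builds += 1

    def test_vocab_contains_low(self):
        with open(self.vocab_file, encoding="utf-8") as fp:
            self.assertIn("low", json.load(fp))

    def test_fixture_written_once(self):
        # Both test methods share the same class-level fixture.
        self.assertEqual(self.builds, 1)


if __name__ == "__main__":
    unittest.main()
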
@@ -60,13 +60,14 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_sentencepiece = True
     from_pretrained_kwargs = {}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
         tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def get_tokenizers(self, **kwargs):
         kwargs.update({"pad_token": "<PAD>"})
@@ -149,8 +150,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.tokenizers_list += (self.rust_tokenizer_class, "hf-internal-testing/llama-tokenizer", {})
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 tmpdirname2 = tempfile.mkdtemp()

@@ -253,7 +254,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 added_tokens = [AddedToken("<special>", lstrip=True)]

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=added_tokens, **kwargs
                 )
                 r_output = tokenizer_r.encode("Hey this is a <special> token")
@@ -263,7 +264,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assertTrue(special_token_id in r_output)

                 if self.test_slow_tokenizer:
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_cr = self.get_rust_tokenizer(
                         pretrained_name,
                         additional_special_tokens=added_tokens,
                         **kwargs,  # , from_slow=True <- unfortunately too slow to convert
@@ -313,8 +314,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         EXPECTED_WITH_SPACE = [1, 18637, 920, 526, 366, 2599]
         EXPECTED_WO_SPACE = [1, 29950, 1032, 920, 526, 366, 2599]

-        slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False)
-        fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False)
+        slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
+        fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
         self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE)
         self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
         self.assertEqual(slow_.tokenize(inputs), ["H", "ey", "▁how", "▁are", "▁you", "▁doing"])
@@ -324,8 +325,8 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True),
         )

-        slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False)
-        fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False)
+        slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
+        fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
         self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE)
         self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
         self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"])
@@ -18,12 +18,13 @@ import itertools
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers import AddedToken, LongformerTokenizer, LongformerTokenizerFast
 from transformers.models.longformer.tokenization_longformer import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_tokenizers, slow

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -36,8 +37,9 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     rust_tokenizer_class = LongformerTokenizerFast
     test_rust_tokenizer = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = [
@@ -64,22 +66,30 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
@@ -173,8 +183,8 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 sentence = "A, <mask> AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                 tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
@@ -204,7 +214,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     def test_change_add_prefix_space_and_trim_offsets_args(self):
         for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
-            tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+            tokenizer_r = self.get_rust_tokenizer(
                 self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
             )

@@ -224,7 +234,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 text_of_1_token = "hello"  # `hello` is a token in the vocabulary of `pretrained_name`
                 text = f"{text_of_1_token} {text_of_1_token}"

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
                 )
                 encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -234,7 +244,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
                 )

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
                 )
                 encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -244,7 +254,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
                 )

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
                 )
                 encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -254,7 +264,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
                )

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
                 )
                 encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -276,7 +286,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                 # )

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
                 )
                 encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -286,7 +296,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                 )

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
                 )
                 encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -296,7 +306,7 @@ class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
                 )

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
                 )
                 encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -14,12 +14,13 @@
 # limitations under the License.

 import unittest
+from functools import lru_cache
 from typing import Tuple

 from transformers import AddedToken, LukeTokenizer
 from transformers.testing_utils import get_tests_dir, require_torch, slow

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 SAMPLE_VOCAB = get_tests_dir("fixtures/vocab.json")
@@ -33,13 +34,17 @@ class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     from_pretrained_kwargs = {"cls_token": "<s>"}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

-        self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}
+        cls.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}

-    def get_tokenizer(self, task=None, **kwargs):
-        kwargs.update(self.special_tokens_map)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, task=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
         tokenizer = LukeTokenizer(
             vocab_file=SAMPLE_VOCAB,
             merges_file=SAMPLE_MERGE_FILE,
@@ -137,8 +142,8 @@ class LukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 sentence = "A, <mask> AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                 tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
@@ -32,8 +32,9 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     space_between_special_tokens = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = [
             "[UNK]",
@@ -50,8 +51,8 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "low",
             "lowest",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

     def get_input_output_texts(self, tokenizer):
@@ -14,6 +14,7 @@

 import tempfile
 import unittest
+from functools import lru_cache
 from pathlib import Path
 from shutil import copyfile

@@ -32,7 +33,7 @@ from transformers.utils import is_sentencepiece_available
 if is_sentencepiece_available():
     from transformers.models.m2m_100.tokenization_m2m_100 import VOCAB_FILES_NAMES, save_json

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 if is_sentencepiece_available():
@@ -54,21 +55,26 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_seq2seq = False
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        save_dir = Path(self.tmpdirname)
+        save_dir = Path(cls.tmpdirname)
         save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
         if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
             copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["spm_file"])

-        tokenizer = M2M100Tokenizer.from_pretrained(self.tmpdirname)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer = M2M100Tokenizer.from_pretrained(cls.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_tokenizer(self, **kwargs):
-        return M2M100Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return M2M100Tokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         return (
@@ -15,6 +15,7 @@

 import tempfile
 import unittest
+from functools import lru_cache
 from pathlib import Path
 from shutil import copyfile

@@ -26,7 +27,7 @@ from transformers.utils import is_sentencepiece_available, is_tf_available, is_t
 if is_sentencepiece_available():
     from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 SAMPLE_SP = get_tests_dir("fixtures/test_sentencepiece.model")
@@ -50,22 +51,28 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        save_dir = Path(self.tmpdirname)
+        save_dir = Path(cls.tmpdirname)
         save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab"])
         save_json(mock_tokenizer_config, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"])
         if not (save_dir / VOCAB_FILES_NAMES["source_spm"]).exists():
             copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["source_spm"])
             copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["target_spm"])

-        tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer = MarianTokenizer.from_pretrained(cls.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_tokenizer(self, **kwargs) -> MarianTokenizer:
-        return MarianTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> MarianTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return MarianTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         return (
tests/models/markuplm/test_tokenization_markuplm.py

@@ -50,26 +50,27 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_kwargs = {"cls_token": "<s>"}
     test_seq2seq = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", "\u0120hello", "\u0120world", "<unk>",] # fmt: skip
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.tags_dict = {"a": 0, "abbr": 1, "acronym": 2, "address": 3}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        self.tokenizer_config_file = os.path.join(self.tmpdirname, "tokenizer_config.json")
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        cls.tokenizer_config_file = os.path.join(cls.tmpdirname, "tokenizer_config.json")

-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))
-        with open(self.tokenizer_config_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps({"tags_dict": self.tags_dict}))
+        with open(cls.tokenizer_config_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps({"tags_dict": cls.tags_dict}))

     def get_nodes_and_xpaths(self):
         nodes = ["hello", "world"]
@@ -421,8 +422,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id
@@ -828,8 +829,8 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 # Input tokens id
                 nodes, xpaths = self.get_nodes_and_xpaths()
@@ -1010,7 +1011,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_offsets_mapping(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 text = ["a", "wonderful", "test"]
                 xpaths = ["html/body" for _ in range(len(text))]
@@ -1125,7 +1126,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 nodes, xpaths = self.get_nodes_and_xpaths()

@@ -1187,7 +1188,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 nodes, xpaths = self.get_nodes_and_xpaths()
                 tokens_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True)
                 tokens_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, add_special_tokens=True)
@@ -1490,7 +1491,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                 pad_token_id = tokenizer_p.pad_token_id
tests/models/mbart/test_tokenization_mbart.py

@@ -47,12 +47,13 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_full_tokenizer(self):
         tokenizer = MBartTokenizer(SAMPLE_VOCAB, keep_accents=True)
@@ -139,8 +140,8 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart", {})
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 tmpdirname2 = tempfile.mkdtemp()
tests/models/mbart50/test_tokenization_mbart50.py

@@ -47,12 +47,13 @@ class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_convert_token_and_id(self):
         """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -117,8 +118,8 @@ class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {})
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 tmpdirname2 = tempfile.mkdtemp()
tests/models/mgp_str/test_tokenization_mgp_str.py

@@ -17,12 +17,13 @@
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers import MgpstrTokenizer
 from transformers.models.mgp_str.tokenization_mgp_str import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_tokenizers

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -33,18 +34,23 @@ class MgpstrTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_kwargs = {}
     test_seq2seq = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = ['[GO]', '[s]', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] # fmt: skip
         vocab_tokens = dict(zip(vocab, range(len(vocab))))

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")

-    def get_tokenizer(self, **kwargs):
-        return MgpstrTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return MgpstrTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "tester"
tests/models/mluke/test_tokenization_mluke.py

@@ -15,12 +15,13 @@

 import unittest
+from functools import lru_cache
 from typing import Tuple

 from transformers.models.mluke.tokenization_mluke import MLukeTokenizer
 from transformers.testing_utils import get_tests_dir, require_torch, slow

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@@ -33,13 +34,17 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     from_pretrained_kwargs = {"cls_token": "<s>"}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

-        self.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}
+        cls.special_tokens_map = {"entity_token_1": "<ent>", "entity_token_2": "<ent2>"}

-    def get_tokenizer(self, task=None, **kwargs):
-        kwargs.update(self.special_tokens_map)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, task=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
         kwargs.update({"task": task})
         tokenizer = MLukeTokenizer(vocab_file=SAMPLE_VOCAB, entity_vocab_file=SAMPLE_ENTITY_VOCAB, **kwargs)
         return tokenizer
@@ -100,8 +105,8 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 sentence = "A, <mask> AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                 tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tests/models/mobilebert/test_tokenization_mobilebert.py

@@ -41,8 +41,9 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_filter = filter_non_english
     pre_trained_model_path = "google/mobilebert-uncased"

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = [
             "[UNK]",
@@ -61,13 +62,13 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "low",
             "lowest",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

-        self.tokenizers_list = [
-            (tokenizer_def[0], self.pre_trained_model_path, tokenizer_def[2]) # else the 'google/' prefix is stripped
-            for tokenizer_def in self.tokenizers_list
+        cls.tokenizers_list = [
+            (tokenizer_def[0], cls.pre_trained_model_path, tokenizer_def[2]) # else the 'google/' prefix is stripped
+            for tokenizer_def in cls.tokenizers_list
         ]

     # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.get_input_output_texts
@@ -275,7 +276,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_offsets_with_special_characters(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
                 tokens = tokenizer_r.encode_plus(
@@ -331,8 +332,8 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 kwargs["tokenize_chinese_chars"] = True
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
                 ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@@ -345,8 +346,8 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)

                 kwargs["tokenize_chinese_chars"] = False
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
                 ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
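Because fixture creation moves from setUp to setUpClass, the vocab files and the rebound tokenizers_list above are built once per test class rather than once per test method, which is what makes class-level attributes such as cls.tmpdirname a sensible base for the cached getters. A minimal unittest sketch of that lifecycle (illustrative only, not transformers code):

import tempfile
import unittest


class SetUpClassDemo(unittest.TestCase):
    setup_calls = 0

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        SetUpClassDemo.setup_calls += 1      # runs once for the whole class
        cls.tmpdirname = tempfile.mkdtemp()  # shared fixture directory, like cls.tmpdirname in the tests

    def test_first(self):
        self.assertEqual(SetUpClassDemo.setup_calls, 1)

    def test_second(self):
        self.assertEqual(SetUpClassDemo.setup_calls, 1)


if __name__ == "__main__":
    unittest.main()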
tests/models/moshi/test_tokenization_moshi.py

@@ -51,8 +51,9 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     from_pretrained_kwargs = {}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = PreTrainedTokenizerFast(
@@ -62,10 +63,11 @@ class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             eos_token="</s>",
         )
         tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizerFast:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     @unittest.skip(reason="No slow tokenizer")
     def test_added_tokens_serialization(self):
tests/models/mpnet/test_tokenization_mpnet.py

@@ -32,8 +32,9 @@ class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     space_between_special_tokens = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = [
             "[UNK]",
@@ -52,8 +53,8 @@ class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
             "low",
             "lowest",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

     def get_input_output_texts(self, tokenizer):
tests/models/mvp/test_tokenization_mvp.py

@@ -14,13 +14,14 @@
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers import BatchEncoding, MvpTokenizer, MvpTokenizerFast
 from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_tokenizers, require_torch
 from transformers.utils import cached_property

-from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors
+from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors, use_cache_if_possible


 @require_tokenizers
@@ -32,8 +33,10 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_filter = filter_roberta_detectors
     # from_pretrained_kwargs = {'add_prefix_space': True}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab = [
             "l",
             "o",
@@ -58,22 +61,30 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         return "lower newer", "lower newer"
@@ -153,8 +164,8 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 sentence = "A, <mask> AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                 tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
tests/models/myt5/test_tokenization_myt5.py

@@ -16,6 +16,7 @@ import binascii
 import unittest

 from transformers import MyT5Tokenizer
+from transformers.testing_utils import slow
 from transformers.utils import is_tf_available, is_torch_available

 from ...test_tokenization_common import TokenizerTesterMixin
@@ -86,15 +87,14 @@ class TestByteRewriter(unittest.TestCase):
         self.assertEqual(decompose_rewriter.rewrite_bytes(in_hex), out_hex)


+# This is way too slow, let's not run it on CircleCI. When trying to use cache, we get OOM and worker(s) crashed.
+@slow
 class MyT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = MyT5Tokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
-
-    def get_tokenizer(self, **kwargs) -> MyT5Tokenizer:
-        return self.tokenizer_class.from_pretrained("Tomlim/myt5-base", **kwargs)
+    def get_tokenizer(cls, **kwargs) -> MyT5Tokenizer:
+        return cls.tokenizer_class.from_pretrained("Tomlim/myt5-base", **kwargs)

     @unittest.skip(reason="inputs cannot be pretokenized as ids depend on whole input string")
     def test_pretokenized_inputs(self):
tests/models/nllb/test_tokenization_nllb.py

@@ -56,12 +56,13 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_sentencepiece = True
     from_pretrained_kwargs = {}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_full_tokenizer(self):
         tokenizer = NllbTokenizer(SAMPLE_VOCAB, keep_accents=True)
@@ -143,8 +144,8 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-nllb", {})
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 tmpdirname2 = tempfile.mkdtemp()

@@ -262,7 +263,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 added_tokens = [AddedToken("<special>", lstrip=True)]

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=added_tokens, **kwargs
                 )
                 r_output = tokenizer_r.encode("Hey this is a <special> token")
@@ -272,7 +273,7 @@ class NllbTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assertTrue(special_token_id in r_output)

                 if self.test_slow_tokenizer:
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_cr = self.get_rust_tokenizer(
                         pretrained_name,
                         additional_special_tokens=added_tokens,
                         **kwargs, # , from_slow=True <- unfortunately too slow to convert
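Calls like the one above pass additional_special_tokens=[AddedToken(...)], an unhashable list that a bare lru_cache would reject with a TypeError. That is presumably what the use_cache_if_possible decorator imported from test_tokenization_common guards against; its implementation is not shown in this diff, so the following is only a hedged sketch of such a guard (hypothetical name and body, not the actual helper):

from functools import lru_cache, wraps


def use_cache_if_possible_sketch(cached_func):
    """Hypothetical guard: skip the lru_cache layer whenever the arguments are not hashable."""

    @wraps(cached_func)
    def wrapper(*args, **kwargs):
        try:
            hash(args + tuple(sorted(kwargs.items())))
        except TypeError:
            # e.g. additional_special_tokens=[...] -> call the undecorated function directly
            return cached_func.__wrapped__(*args, **kwargs)
        return cached_func(*args, **kwargs)

    return wrapper


@use_cache_if_possible_sketch
@lru_cache(maxsize=64)
def load(name, **kwargs):
    return (name, tuple(kwargs))


load("x")                    # hashable arguments -> cached as usual
load("x", extra=["a", "b"])  # unhashable value -> bypasses the cache instead of raising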
tests/models/nougat/test_tokenization_nougat.py

@@ -13,13 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import copy
 import unittest
+from functools import lru_cache

 from transformers import NougatTokenizerFast
 from transformers.models.nougat.tokenization_nougat_fast import markdown_compatible, normalize_list_like_lines
 from transformers.testing_utils import require_levenshtein, require_nltk, require_tokenizers

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -33,19 +35,26 @@ class NougatTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_vocab_key = "tokenizer_file"
     special_tokens_map = {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
         tokenizer = NougatTokenizerFast.from_pretrained("facebook/nougat-base")
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return NougatTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        _kwargs = copy.deepcopy(cls.special_tokens_map)
+        _kwargs.update(kwargs)
+        kwargs = _kwargs
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return NougatTokenizerFast.from_pretrained(pretrained_name, **kwargs)

     def test_padding(self, max_length=6):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
                 # Simple input
                 sentence1 = "This is a simple input"
                 sentence2 = ["This is a simple input 1", "This is a simple input 2"]
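One detail worth noting in the new Nougat getter (and in the Qwen2 getters further down): instead of kwargs.update(self.special_tokens_map), which let the class-level defaults overwrite caller arguments, the new code deep-copies the shared map and layers the caller's kwargs on top, so explicit arguments win and the shared class attribute is kept isolated from later mutation. A small standalone illustration of the dict-merge semantics (example names, not taken from the patch):

import copy

special_tokens_map = {"bos_token": "<s>", "eos_token": "</s>"}

# old pattern: the shared defaults clobber whatever the caller asked for
kwargs = {"eos_token": "<|end|>"}
kwargs.update(special_tokens_map)
assert kwargs["eos_token"] == "</s>"

# new pattern: caller kwargs take precedence and the shared dict is never modified
kwargs = {"eos_token": "<|end|>"}
_kwargs = copy.deepcopy(special_tokens_map)
_kwargs.update(kwargs)
assert _kwargs["eos_token"] == "<|end|>"
assert special_tokens_map == {"bos_token": "<s>", "eos_token": "</s>"}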
tests/models/openai/test_tokenization_openai.py

@@ -35,8 +35,9 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_seq2seq = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = [
@@ -65,11 +66,11 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w") as fp:
             fp.write(json.dumps(vocab_tokens))
-        with open(self.merges_file, "w") as fp:
+        with open(cls.merges_file, "w") as fp:
             fp.write("\n".join(merges))

     def get_input_output_texts(self, tokenizer):
@@ -90,7 +91,7 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=15):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 # Simple input
                 s = "This is a simple input"
tests/models/pegasus/test_tokenization_pegasus.py

@@ -13,12 +13,13 @@
 # limitations under the License.

 import unittest
+from functools import lru_cache

 from transformers import PegasusTokenizer, PegasusTokenizerFast
 from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow
 from transformers.utils import cached_property

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")
@@ -33,19 +34,24 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = PegasusTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     @cached_property
     def _large_tokenizer(self):
         return PegasusTokenizer.from_pretrained("google/pegasus-large")

-    def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
-        return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PegasusTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return PegasusTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         return ("This is a test", "This is a test")
@@ -70,8 +76,8 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         self.assertEqual(self.get_tokenizer().vocab_size, 1_103)

     def test_mask_tokens_rust_pegasus(self):
-        rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
-        py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
+        rust_tokenizer = self.get_rust_tokenizer(self.tmpdirname)
+        py_tokenizer = self.get_tokenizer(self.tmpdirname)
         raw_input_str = (
             "Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important"
             " </s> <pad> <pad> <pad>"
@@ -138,26 +144,31 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = PegasusTokenizer(SAMPLE_VOCAB, offset=0, mask_token_sent=None, mask_token="[MASK]")
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     @cached_property
     def _large_tokenizer(self):
         return PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")

-    def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
-        return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PegasusTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return PegasusTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         return ("This is a test", "This is a test")

     def test_mask_tokens_rust_pegasus(self):
-        rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
-        py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
+        rust_tokenizer = self.get_rust_tokenizer(self.tmpdirname)
+        py_tokenizer = self.get_tokenizer(self.tmpdirname)
         raw_input_str = (
             "Let's see which <unk> is the better <unk_token> one [MASK] It seems like this [MASK] was important </s>"
             " <pad> <pad> <pad>"
tests/models/perceiver/test_tokenization_perceiver.py

@@ -19,12 +19,13 @@ import re
 import shutil
 import tempfile
 import unittest
+from functools import lru_cache
 from typing import Tuple

 from transformers import AddedToken, BatchEncoding, PerceiverTokenizer
 from transformers.utils import cached_property, is_tf_available, is_torch_available

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 if is_torch_available():
@@ -40,17 +41,22 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = PerceiverTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
         tokenizer = PerceiverTokenizer()
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     @cached_property
     def perceiver_tokenizer(self):
         return PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")

-    def get_tokenizer(self, **kwargs) -> PerceiverTokenizer:
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PerceiverTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
         # XXX The default common tokenizer tests assume that every ID is decodable on its own.
tests/models/phobert/test_tokenization_phobert.py

@@ -15,10 +15,11 @@

 import os
 import unittest
+from functools import lru_cache

 from transformers.models.phobert.tokenization_phobert import VOCAB_FILES_NAMES, PhobertTokenizer

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@@ -26,27 +27,32 @@ class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = PhobertTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["T@@", "i", "I", "R@@", "r", "e@@"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l à</w>"]
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])

-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             for token in vocab_tokens:
                 fp.write(f"{token} {vocab_tokens[token]}\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return PhobertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return PhobertTokenizer.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "Tôi là VinAI Research"
tests/models/plbart/test_tokenization_plbart.py

@@ -45,12 +45,13 @@ class PLBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     rust_tokenizer_class = None
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_full_base_tokenizer(self):
         tokenizer = PLBartTokenizer(SAMPLE_VOCAB, language_codes="base", keep_accents=True)
tests/models/prophetnet/test_tokenization_prophetnet.py

@@ -36,8 +36,9 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = ProphetNetTokenizer
     test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = [
             "[UNK]",
@@ -56,8 +57,8 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "low",
             "lowest",
         ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

     def get_input_output_texts(self, tokenizer):
tests/models/qwen2/test_tokenization_qwen2.py

@@ -14,15 +14,17 @@
 # limitations under the License.


+import copy
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers import AddedToken, Qwen2Tokenizer, Qwen2TokenizerFast
 from transformers.models.qwen2.tokenization_qwen2 import VOCAB_FILES_NAMES, bytes_to_unicode
 from transformers.testing_utils import require_tokenizers, slow

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -36,8 +38,9 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_kwargs = None
     test_seq2seq = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # this make sure the vocabuary is complete at the byte level.
         vocab = list(bytes_to_unicode().values())
@@ -81,22 +84,34 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "# #",
         ]

-        self.special_tokens_map = {"eos_token": "<|endoftext|>"}
+        cls.special_tokens_map = {"eos_token": "<|endoftext|>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return Qwen2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        _kwargs = copy.deepcopy(cls.special_tokens_map)
+        _kwargs.update(kwargs)
+        kwargs = _kwargs
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return Qwen2Tokenizer.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return Qwen2TokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        _kwargs = copy.deepcopy(cls.special_tokens_map)
+        _kwargs.update(kwargs)
+        kwargs = _kwargs
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return Qwen2TokenizerFast.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
||||||
# this case should cover
|
# this case should cover
|
||||||
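
Note the two merge orders in play: the Phobert-style helper calls `kwargs.update(cls.special_tokens_map)`, so the class defaults win on a key collision, while the Qwen2 helpers copy the defaults first and then apply the caller's kwargs, so the caller wins. A toy illustration of the difference (the values here are made up):

import copy

special_tokens_map = {"eos_token": "<|endoftext|>"}   # class-level defaults
caller_kwargs = {"eos_token": "<eos>"}                # what a test passes in

# kwargs.update(cls.special_tokens_map): the defaults overwrite the caller's value
merged_defaults_win = dict(caller_kwargs)
merged_defaults_win.update(special_tokens_map)
assert merged_defaults_win["eos_token"] == "<|endoftext|>"

# _kwargs = copy.deepcopy(defaults); _kwargs.update(kwargs): the caller's value wins
merged_caller_wins = copy.deepcopy(special_tokens_map)
merged_caller_wins.update(caller_kwargs)
assert merged_caller_wins["eos_token"] == "<eos>"
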
@@ -34,11 +34,12 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_seq2seq = False
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_convert_token_and_id(self):
         """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -84,7 +85,7 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_padding(self, max_length=15):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 # Simple input
                 s = "This is a simple input"
@@ -39,11 +39,12 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_sentencepiece_ignore_case = True
     pre_trained_model_path = "google/rembert"

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         tokenizer = RemBertTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     # Copied from ReformerTokenizationTest.get_input_output_texts
     def get_input_output_texts(self, tokenizer):
@@ -222,7 +223,7 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

         with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
             if self.rust_tokenizer_class is not None:
-                tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
+                tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos)
                 self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
                 self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
                 # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
@@ -18,12 +18,13 @@ import itertools
 import json
 import os
 import unittest
+from functools import lru_cache

 from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast
 from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
 from transformers.testing_utils import require_tokenizers, slow

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_tokenizers
@@ -34,8 +35,9 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = True
     from_pretrained_kwargs = {"cls_token": "<s>"}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = [
@@ -62,22 +64,30 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        self.special_tokens_map = {"unk_token": "<unk>"}
+        cls.special_tokens_map = {"unk_token": "<unk>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
             fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(self.merges_file, "w", encoding="utf-8") as fp:
+        with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
@@ -171,8 +181,8 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                 sentence = "A, <mask> AllenNLP sentence."
                 tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                 tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
@@ -202,7 +212,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

     def test_change_add_prefix_space_and_trim_offsets_args(self):
         for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
-            tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+            tokenizer_r = self.get_rust_tokenizer(
                 self.tmpdirname, use_fast=True, add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
             )

@@ -222,7 +232,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         text_of_1_token = "hello"  # `hello` is a token in the vocabulary of `pretrained_name`
         text = f"{text_of_1_token} {text_of_1_token}"

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+        tokenizer_r = self.get_rust_tokenizer(
             pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=True
         )
         encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -232,7 +242,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
         )

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+        tokenizer_r = self.get_rust_tokenizer(
             pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
         )
         encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -242,7 +252,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
         )

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+        tokenizer_r = self.get_rust_tokenizer(
             pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
         )
         encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -252,7 +262,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             (len(text_of_1_token), len(text_of_1_token) + 1 + len(text_of_1_token)),
         )

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+        tokenizer_r = self.get_rust_tokenizer(
             pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
         )
         encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -274,7 +284,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         # (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
         # )

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+        tokenizer_r = self.get_rust_tokenizer(
             pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=True
         )
         encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -284,7 +294,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
         )

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+        tokenizer_r = self.get_rust_tokenizer(
             pretrained_name, use_fast=True, add_prefix_space=True, trim_offsets=False
         )
         encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
@@ -294,7 +304,7 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             (1 + len(text_of_1_token), 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
         )

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+        tokenizer_r = self.get_rust_tokenizer(
             pretrained_name, use_fast=True, add_prefix_space=False, trim_offsets=False
         )
         encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
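
With the call sites above now going through `self.get_tokenizer` / `self.get_rust_tokenizer`, repeated requests with equal arguments are served from the `lru_cache` and therefore return the same tokenizer instance, a point worth keeping in mind when a test mutates the returned tokenizer in place. A small self-contained sketch of that behaviour (the factory below is a stand-in, not the real mixin):

from functools import lru_cache


class CachedFactory:  # stand-in for the tokenizer test mixin
    @classmethod
    @lru_cache(maxsize=64)
    def get_tokenizer(cls, pretrained_name="fixture-dir"):
        return object()  # stands in for SomeTokenizer.from_pretrained(pretrained_name)


a = CachedFactory.get_tokenizer(pretrained_name="fixture-dir")
b = CachedFactory.get_tokenizer(pretrained_name="fixture-dir")
assert a is b  # equal arguments hit the cache and share one instance

c = CachedFactory.get_tokenizer(pretrained_name="other-dir")
assert c is not a  # different arguments create a new cache entry
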
@@ -41,8 +41,9 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     space_between_special_tokens = True
     from_pretrained_filter = filter_non_english

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "你", "好", "是", "谁", "a", "b", "c", "d"]
         word_shape = {}
@@ -50,14 +51,14 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for i, value in enumerate(vocab_tokens):
             word_shape[value] = i
             word_pronunciation[value] = i
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.word_shape_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["word_shape_file"])
-        self.word_pronunciation_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["word_pronunciation_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.word_shape_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["word_shape_file"])
+        cls.word_pronunciation_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["word_pronunciation_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-        with open(self.word_shape_file, "w", encoding="utf-8") as word_shape_writer:
+        with open(cls.word_shape_file, "w", encoding="utf-8") as word_shape_writer:
             json.dump(word_shape, word_shape_writer, ensure_ascii=False)
-        with open(self.word_pronunciation_file, "w", encoding="utf-8") as word_pronunciation_writer:
+        with open(cls.word_pronunciation_file, "w", encoding="utf-8") as word_pronunciation_writer:
             json.dump(word_pronunciation, word_pronunciation_writer, ensure_ascii=False)

     def test_full_tokenizer(self):
@@ -204,7 +205,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def test_offsets_with_special_characters(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
                 tokens = tokenizer_r.encode_plus(
@@ -260,8 +261,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 kwargs["tokenize_chinese_chars"] = True
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                 ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
                 ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
@@ -274,8 +275,8 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assertListEqual(tokens_without_spe_char_r, list_of_commun_chinese_char)

                 kwargs["tokenize_chinese_chars"] = False
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                 ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
                 ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
@@ -15,11 +15,12 @@

 import tempfile
 import unittest
+from functools import lru_cache

 from transformers import RoFormerTokenizer, RoFormerTokenizerFast
 from transformers.testing_utils import require_rjieba, require_tokenizers

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_rjieba
@@ -31,14 +32,25 @@ class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     space_between_special_tokens = True
     test_rust_tokenizer = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        tokenizer = cls.tokenizer_class.from_pretrained("junnyu/roformer_chinese_base")
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_tokenizer(self, **kwargs):
-        return self.tokenizer_class.from_pretrained("junnyu/roformer_chinese_base", **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs):
-        return self.rust_tokenizer_class.from_pretrained("junnyu/roformer_chinese_base", **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     def get_chinese_input_output_texts(self):
         input_text = "永和服装饰品有限公司,今天天气非常好"
@@ -59,12 +59,13 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_sentencepiece = True
     from_pretrained_kwargs = {}

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = SeamlessM4TTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_full_tokenizer(self):
         tokenizer = SeamlessM4TTokenizer(SAMPLE_VOCAB, keep_accents=True)
@@ -353,7 +354,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 added_tokens = [AddedToken("<special>", lstrip=True)]

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=added_tokens, **kwargs
                 )
                 r_output = tokenizer_r.encode("Hey this is a <special> token")
@@ -363,7 +364,7 @@ class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assertTrue(special_token_id in r_output)

                 if self.test_slow_tokenizer:
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_cr = self.get_rust_tokenizer(
                         pretrained_name,
                         additional_special_tokens=added_tokens,
                         **kwargs,  # , from_slow=True <- unfortunately too slow to convert
@@ -17,12 +17,13 @@ import json
 import os
 import tempfile
 import unittest
+from functools import lru_cache

 from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, SiglipTokenizer
 from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
 from transformers.utils import cached_property, is_tf_available, is_torch_available

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@@ -44,13 +45,13 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     test_sentencepiece = True
     test_sentencepiece_ignore_case = True

-    # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.setUp with T5->Siglip
-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = SiglipTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_convert_token_and_id with T5->Siglip
     def test_convert_token_and_id(self):
@@ -135,9 +136,12 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def siglip_tokenizer(self):
         return SiglipTokenizer.from_pretrained("google/siglip-base-patch16-224")

-    # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.get_tokenizer with T5->Siglip
-    def get_tokenizer(self, **kwargs) -> SiglipTokenizer:
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> SiglipTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_rust_and_python_full_tokenizers with T5->Siglip
     def test_rust_and_python_full_tokenizers(self):
@@ -227,10 +231,10 @@ class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special>", lstrip=True)]

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=added_tokens, **kwargs
                 )
-                tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_cr = self.get_rust_tokenizer(
                     pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
                 )
                 tokenizer_p = self.tokenizer_class.from_pretrained(
@@ -42,8 +42,9 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         spm_model = sp.SentencePieceProcessor()
         spm_model.Load(SAMPLE_VOCAB)
@@ -52,13 +53,13 @@ class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
         vocab += [spm_model.IdToPiece(id_) for id_ in range(len(spm_model))]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))

-        save_dir = Path(self.tmpdirname)
+        save_dir = Path(cls.tmpdirname)
         save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab_file"])
         if not (save_dir / VOCAB_FILES_NAMES["spm_file"]).exists():
             copyfile(SAMPLE_VOCAB, save_dir / VOCAB_FILES_NAMES["spm_file"])

-        tokenizer = Speech2TextTokenizer.from_pretrained(self.tmpdirname)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer = Speech2TextTokenizer.from_pretrained(cls.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def test_convert_token_and_id(self):
         """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -35,8 +35,9 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     test_rust_tokenizer = False
     test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

         # We have a SentencePiece fixture for testing
         tokenizer = SpeechT5Tokenizer(SAMPLE_VOCAB)
@@ -46,7 +47,7 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
         tokenizer.add_special_tokens({"mask_token": mask_token})
         tokenizer.add_tokens(["<ctc_blank>"])

-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

     def get_input_output_texts(self, tokenizer):
         input_text = "this is a test"
@@ -13,8 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
+from functools import lru_cache

-from tests.test_tokenization_common import TokenizerTesterMixin
+from tests.test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
 from transformers import SplinterTokenizerFast, is_tf_available, is_torch_available
 from transformers.models.splinter import SplinterTokenizer
 from transformers.testing_utils import get_tests_dir, slow
@@ -40,20 +41,29 @@ class SplinterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     pre_trained_model_path = "tau/splinter-base"

     # Copied from transformers.models.siglip.SiglipTokenizationTest.setUp
-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
         tokenizer = SplinterTokenizer(SAMPLE_VOCAB)
         tokenizer.vocab["[UNK]"] = len(tokenizer.vocab)
         tokenizer.vocab["[QUESTION]"] = len(tokenizer.vocab)
         tokenizer.vocab["."] = len(tokenizer.vocab)
         tokenizer.add_tokens("this is a test thou shall not determine rigor truly".split())
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

-    def get_tokenizer(self, **kwargs) -> SplinterTokenizer:
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> SplinterTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs) -> SplinterTokenizerFast:
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> SplinterTokenizerFast:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

     # Copied from transformers.models.siglip.SiglipTokenizationTest.test_get_vocab
     def test_get_vocab(self):
@@ -13,22 +13,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from functools import lru_cache

 from transformers import SqueezeBertTokenizer, SqueezeBertTokenizerFast
 from transformers.testing_utils import require_tokenizers, slow

-from ..bert.test_tokenization_bert import BertTokenizationTest
+from ...test_tokenization_common import use_cache_if_possible

+# Avoid import `BertTokenizationTest` directly as it will run as `test_tokenization_squeezebert.py::BertTokenizationTest`
+# together with `test_tokenization_bert.py::BertTokenizationTest`.
+from ..bert import test_tokenization_bert


 @require_tokenizers
-class SqueezeBertTokenizationTest(BertTokenizationTest):
+class SqueezeBertTokenizationTest(test_tokenization_bert.BertTokenizationTest):
     tokenizer_class = SqueezeBertTokenizer
     rust_tokenizer_class = SqueezeBertTokenizerFast
     test_rust_tokenizer = True
     from_pretrained_id = "squeezebert/squeezebert-uncased"

-    def get_rust_tokenizer(self, **kwargs):
-        return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return SqueezeBertTokenizerFast.from_pretrained(pretrained_name, **kwargs)

     @slow
     def test_sequence_builders(self):
|
@ -17,12 +17,13 @@ import os
|
|||||||
import re
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
import unittest
|
import unittest
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
|
from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
|
||||||
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_seqio, require_tokenizers, slow
|
from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_seqio, require_tokenizers, slow
|
||||||
from transformers.utils import cached_property, is_tf_available, is_torch_available
|
from transformers.utils import cached_property, is_tf_available, is_torch_available
|
||||||
|
|
||||||
from ...test_tokenization_common import TokenizerTesterMixin
|
from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible
|
||||||
|
|
||||||
|
|
||||||
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
|
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
|
||||||
@ -44,12 +45,13 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
test_rust_tokenizer = True
|
test_rust_tokenizer = True
|
||||||
test_sentencepiece = True
|
test_sentencepiece = True
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# We have a SentencePiece fixture for testing
|
# We have a SentencePiece fixture for testing
|
||||||
tokenizer = T5Tokenizer(SAMPLE_VOCAB)
|
tokenizer = T5Tokenizer(SAMPLE_VOCAB)
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(cls.tmpdirname)
|
||||||
|
|
||||||
def test_convert_token_and_id(self):
|
def test_convert_token_and_id(self):
|
||||||
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
|
"""Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
|
||||||
@ -145,11 +147,19 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def t5_base_tokenizer_fast(self):
|
def t5_base_tokenizer_fast(self):
|
||||||
return T5TokenizerFast.from_pretrained("google-t5/t5-base")
|
return T5TokenizerFast.from_pretrained("google-t5/t5-base")
|
||||||
|
|
||||||
def get_tokenizer(self, **kwargs) -> T5Tokenizer:
|
@classmethod
|
||||||
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
|
@use_cache_if_possible
|
||||||
|
@lru_cache(maxsize=64)
|
||||||
|
def get_tokenizer(cls, pretrained_name=None, **kwargs) -> T5Tokenizer:
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def get_rust_tokenizer(self, **kwargs) -> T5TokenizerFast:
|
@classmethod
|
||||||
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
|
@use_cache_if_possible
|
||||||
|
@lru_cache(maxsize=64)
|
||||||
|
def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> T5TokenizerFast:
|
||||||
|
pretrained_name = pretrained_name or cls.tmpdirname
|
||||||
|
return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
||||||
|
|
||||||
def test_rust_and_python_full_tokenizers(self):
|
def test_rust_and_python_full_tokenizers(self):
|
||||||
if not self.test_rust_tokenizer:
|
if not self.test_rust_tokenizer:
|
||||||
@ -275,10 +285,10 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special>", lstrip=True)]
|
added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special>", lstrip=True)]
|
||||||
|
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(
|
tokenizer_r = self.get_rust_tokenizer(
|
||||||
pretrained_name, additional_special_tokens=added_tokens, **kwargs
|
pretrained_name, additional_special_tokens=added_tokens, **kwargs
|
||||||
)
|
)
|
||||||
tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
|
tokenizer_cr = self.get_rust_tokenizer(
|
||||||
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
|
pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
|
||||||
)
|
)
|
||||||
tokenizer_p = self.tokenizer_class.from_pretrained(
|
tokenizer_p = self.tokenizer_class.from_pretrained(
|
||||||
@ -460,10 +470,8 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
EXPECTED_WITH_SPACE = [9459, 149, 33, 25, 692, 1]
|
EXPECTED_WITH_SPACE = [9459, 149, 33, 25, 692, 1]
|
||||||
EXPECTED_WO_SPACE = [3845, 63, 149, 33, 25, 692, 1]
|
EXPECTED_WO_SPACE = [3845, 63, 149, 33, 25, 692, 1]
|
||||||
|
|
||||||
slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=False, legacy=False)
|
slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
|
||||||
fast_ = self.rust_tokenizer_class.from_pretrained(
|
fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=False, legacy=False, from_slow=True)
|
||||||
pretrained_name, add_prefix_space=False, legacy=False, from_slow=True
|
|
||||||
)
|
|
||||||
self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE)
|
self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE)
|
||||||
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
|
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
|
||||||
self.assertEqual(slow_.tokenize(inputs), ["He", "y", "▁how", "▁are", "▁you", "▁doing"])
|
self.assertEqual(slow_.tokenize(inputs), ["He", "y", "▁how", "▁are", "▁you", "▁doing"])
|
||||||
@ -473,8 +481,8 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True),
|
fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True),
|
||||||
)
|
)
|
||||||
|
|
||||||
slow_ = self.tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False)
|
slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
|
||||||
fast_ = self.rust_tokenizer_class.from_pretrained(pretrained_name, add_prefix_space=True, legacy=False)
|
fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
|
||||||
self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE)
|
self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE)
|
||||||
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
|
self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
|
||||||
self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"])
|
self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"])
|
||||||
|
@ -112,8 +112,9 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
return output_txt, output_ids
|
return output_txt, output_ids
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
vocab_tokens = [
|
vocab_tokens = [
|
||||||
"[UNK]",
|
"[UNK]",
|
||||||
@ -132,8 +133,8 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
"low",
|
"low",
|
||||||
"lowest",
|
"lowest",
|
||||||
]
|
]
|
||||||
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
|
||||||
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
|
||||||
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
@ -352,7 +353,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def test_offsets_with_special_characters(self):
|
def test_offsets_with_special_characters(self):
|
||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
|
||||||
|
|
||||||
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
|
sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
|
||||||
tokens = tokenizer_r.encode_plus(
|
tokens = tokenizer_r.encode_plus(
|
||||||
|
@ -93,12 +93,13 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
return questions, words, boxes
|
return questions, words, boxes
|
||||||
|
|
||||||
def setUp(self):
|
@classmethod
|
||||||
super().setUp()
|
def setUpClass(cls):
|
||||||
|
super().setUpClass()
|
||||||
|
|
||||||
# We have a SentencePiece fixture for testing
|
# We have a SentencePiece fixture for testing
|
||||||
tokenizer = UdopTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
tokenizer = UdopTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(cls.tmpdirname)
|
||||||
|
|
||||||
def get_input_output_texts(self, tokenizer):
|
def get_input_output_texts(self, tokenizer):
|
||||||
input_text = "UNwant\u00e9d,running"
|
input_text = "UNwant\u00e9d,running"
|
||||||
@ -456,8 +457,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
def test_padding(self, max_length=50):
|
def test_padding(self, max_length=50):
|
||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
|
||||||
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
|
tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
|
||||||
|
|
||||||
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
|
self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
|
||||||
pad_token_id = tokenizer_p.pad_token_id
|
pad_token_id = tokenizer_p.pad_token_id
|
||||||
@@ -922,8 +923,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                # Input tokens id
                words, boxes = self.get_words_and_boxes()
@@ -1109,7 +1110,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    def test_offsets_mapping(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                text = ["a", "wonderful", "test"]
                boxes = [[1, 8, 12, 20] for _ in range(len(text))]
@@ -1239,8 +1240,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                words, boxes = self.get_words_and_boxes()

@@ -1293,8 +1294,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                words, boxes = self.get_words_and_boxes()
                tokens_r = tokenizer_r.encode_plus_boxes(
                    words,
@@ -1320,7 +1321,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    def test_compare_add_special_tokens(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)

@@ -1402,7 +1403,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                added_tokens = [AddedToken("<special>", lstrip=True)]

-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
                )
                words = "Hey this is a <special> token".split()
@@ -1416,7 +1417,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                self.assertTrue(special_token_id in r_output)

                if self.test_slow_tokenizer:
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_cr = self.get_rust_tokenizer(
                        pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
                    )
                    tokenizer_p = self.tokenizer_class.from_pretrained(
@@ -1591,8 +1592,8 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                pad_token_id = tokenizer_p.pad_token_id

@@ -19,12 +19,13 @@ import os
 import shutil
 import tempfile
 import unittest
+from functools import lru_cache

 from transformers import VitsTokenizer
 from transformers.models.vits.tokenization_vits import VOCAB_FILES_NAMES
 from transformers.testing_utils import slow

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
@@ -32,8 +33,9 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = VitsTokenizer
    test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

        vocab = (
            "k ' z y u d h e s w – 3 c p - 1 j m i X f l o 0 b r a 4 2 n _ x v t q 5 6 g ț ţ < > | <pad> <unk>".split(
@@ -44,18 +46,22 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
        vocab_tokens[" "] = vocab_tokens["X"]
        del vocab_tokens["X"]

-        self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>"}
+        cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>"}

-        self.tmpdirname = tempfile.mkdtemp()
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.tmpdirname = tempfile.mkdtemp()
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
        kwargs["phonemize"] = False
        kwargs["normalize"] = False
-        return VitsTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return VitsTokenizer.from_pretrained(pretrained_name, **kwargs)

    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5):
        txt = "beyonce lives in los angeles"
@@ -21,6 +21,7 @@ import random
 import shutil
 import tempfile
 import unittest
+from functools import lru_cache

 import numpy as np

@@ -33,7 +34,7 @@ from transformers import (
 from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES, Wav2Vec2CTCTokenizerOutput
 from transformers.testing_utils import require_torch, slow

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 global_rng = random.Random()
@@ -57,22 +58,27 @@ def floats_list(shape, scale=1.0, rng=None, name=None):
 class Wav2Vec2TokenizerTest(unittest.TestCase):
    tokenizer_class = Wav2Vec2Tokenizer

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

        vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
        vocab_tokens = dict(zip(vocab, range(len(vocab))))

-        self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
+        cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}

-        self.tmpdirname = tempfile.mkdtemp()
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.tmpdirname = tempfile.mkdtemp()
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return Wav2Vec2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return Wav2Vec2Tokenizer.from_pretrained(pretrained_name, **kwargs)

    def test_tokenizer_decode(self):
        # TODO(PVP) - change to facebook
@@ -237,7 +243,7 @@ class Wav2Vec2TokenizerTest(unittest.TestCase):

    def test_save_pretrained(self):
        pretrained_name = list(self.tokenizer_class.pretrained_vocab_files_map["vocab_file"].keys())[0]
-        tokenizer = self.tokenizer_class.from_pretrained(pretrained_name)
+        tokenizer = self.get_tokenizer(pretrained_name)
        tmpdirname2 = tempfile.mkdtemp()

        tokenizer_files = tokenizer.save_pretrained(tmpdirname2)
@@ -373,22 +379,27 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = Wav2Vec2CTCTokenizer
    test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

        vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
        vocab_tokens = dict(zip(vocab, range(len(vocab))))

-        self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
+        cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}

-        self.tmpdirname = tempfile.mkdtemp()
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.tmpdirname = tempfile.mkdtemp()
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return Wav2Vec2CTCTokenizer.from_pretrained(pretrained_name, **kwargs)

    def test_tokenizer_add_token_chars(self):
        tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h")
@@ -17,6 +17,7 @@
 import json
 import os
 import unittest
+from functools import lru_cache
 from typing import Tuple

 from transformers import Wav2Vec2PhonemeCTCTokenizer
@@ -24,7 +25,7 @@ from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
 from transformers.models.wav2vec2_phoneme.tokenization_wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizerOutput
 from transformers.testing_utils import require_phonemizer

-from ...test_tokenization_common import TokenizerTesterMixin
+from ...test_tokenization_common import TokenizerTesterMixin, use_cache_if_possible


 @require_phonemizer
@@ -33,8 +34,9 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = Wav2Vec2PhonemeCTCTokenizer
    test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

        vocab = (
            "<s> <pad> </s> <unk> n s t ə l a i k d m ɛ ɾ e ɪ p o ɐ z ð f j v b ɹ ʁ ʊ iː r w ʌ u ɡ æ aɪ ʃ h ɔ ɑː "
@@ -53,10 +55,10 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
        ).split(" ")
        vocab_tokens = dict(zip(vocab, range(len(vocab))))

-        self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
+        cls.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")

    # overwrite since phonemes require specific creation
@@ -84,9 +86,13 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
        output_ids = tokenizer.encode(output_txt, add_special_tokens=False)
        return output_txt, output_ids

-    def get_tokenizer(self, **kwargs):
-        kwargs.update(self.special_tokens_map)
-        return Wav2Vec2PhonemeCTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        kwargs.update(cls.special_tokens_map)
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return Wav2Vec2PhonemeCTCTokenizer.from_pretrained(pretrained_name, **kwargs)

    def test_tokenizer_add_new_tokens(self):
        tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
@@ -40,12 +40,13 @@ class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    test_sentencepiece = False
    test_seq2seq = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
        tokenizer.pad_token_id = 50256
        tokenizer.pad_token = "<|endoftext|>"
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -37,12 +37,13 @@ class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    test_rust_tokenizer = True
    test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

        # We have a SentencePiece fixture for testing
        tokenizer = XGLMTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -29,8 +29,9 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = XLMTokenizer
    test_rust_tokenizer = False

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = [
@@ -59,11 +60,11 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(self.vocab_file, "w") as fp:
+        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+        with open(cls.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
-        with open(self.merges_file, "w") as fp:
+        with open(cls.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_input_output_texts(self, tokenizer):
@@ -37,12 +37,13 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    test_rust_tokenizer = True
    test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

        # We have a SentencePiece fixture for testing
        tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -148,8 +149,8 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-xlm-roberta", {})
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                tmpdirname2 = tempfile.mkdtemp()

@@ -33,12 +33,13 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    test_rust_tokenizer = True
    test_sentencepiece = True

-    def setUp(self):
-        super().setUp()
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()

        # We have a SentencePiece fixture for testing
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(self.tmpdirname)
+        tokenizer.save_pretrained(cls.tmpdirname)

    def test_convert_token_and_id(self):
        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import copy
+import functools
 import inspect
 import itertools
 import json
@@ -24,6 +26,7 @@ import tempfile
 import traceback
 import unittest
 from collections import OrderedDict
+from functools import lru_cache
 from itertools import takewhile
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
@@ -69,6 +72,38 @@ if TYPE_CHECKING:
    from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel


+def use_cache_if_possible(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        use_cache = kwargs.pop("use_cache", True)
+
+        underline_func = func
+        if "functools" in str(func):
+            underline_func = func.__wrapped__
+
+        if not use_cache:
+            return underline_func(*args, **kwargs)
+        if any(not arg.__hash__ for arg in args):
+            return underline_func(*args, **kwargs)
+        elif any(not kwarg.__hash__ for kwarg in kwargs.values()):
+            return underline_func(*args, **kwargs)
+
+        cached = func(*args, **kwargs)
+        copied = copy.deepcopy(cached)
+
+        if hasattr(copied, "_tokenizer") and "tests.models.clip.test_tokenization_clip.CLIPTokenizationTest" in str(
+            args[0]
+        ):
+            copied._tokenizer = cached._tokenizer
+
+        if hasattr(copied, "sp_model"):
+            copied.sp_model = cached.sp_model
+
+        return copied
+
+    return wrapper
+
+
 logger = logging.get_logger(__name__)

 NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"]
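For readers skimming the diff, the behaviour of the helper above is easiest to see on a toy class. The sketch below is illustrative only: the simplified decorator, DummyLoader, and DummyTokenizerTest are stand-ins, not code from this commit. It shows the decorator order the test suite now uses (classmethod outermost, use_cache_if_possible next, lru_cache innermost), so that repeated calls with the same hashable arguments reuse one loaded object, every caller receives its own deep copy, and use_cache=False falls back to an uncached load.

import copy
import functools
from functools import lru_cache


def use_cache_if_possible(func):
    # Simplified stand-in for the helper added above: pop `use_cache`, serve deep
    # copies of the cached result, and call the undecorated function when caching
    # is disabled.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        use_cache = kwargs.pop("use_cache", True)
        if not use_cache:
            return func.__wrapped__(*args, **kwargs)
        return copy.deepcopy(func(*args, **kwargs))

    return wrapper


class DummyLoader:
    calls = 0

    @classmethod
    def from_pretrained(cls, name, **kwargs):
        cls.calls += 1
        return {"name": name, **kwargs}


class DummyTokenizerTest:
    tmpdirname = "/tmp/fixture"

    @classmethod
    @use_cache_if_possible
    @lru_cache(maxsize=64)
    def get_tokenizer(cls, pretrained_name=None, **kwargs):
        pretrained_name = pretrained_name or cls.tmpdirname
        return DummyLoader.from_pretrained(pretrained_name, **kwargs)


a = DummyTokenizerTest.get_tokenizer()
b = DummyTokenizerTest.get_tokenizer()                 # lru_cache hit: no second load
c = DummyTokenizerTest.get_tokenizer(use_cache=False)  # bypasses both caches
assert DummyLoader.calls == 2
assert a == b and a is not b  # each cached call hands back an independent deep copy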
@@ -198,32 +233,34 @@ class TokenizerTesterMixin:
    # test_sentencepiece must also be set to True
    test_sentencepiece_ignore_case = False

-    def setUp(self) -> None:
+    @classmethod
+    def setUpClass(cls) -> None:
        # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
        # information available in Tokenizer (name, rust class, python class, vocab key name)
-        self.from_pretrained_id = (
-            [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id
+        cls.from_pretrained_id = (
+            [cls.from_pretrained_id] if isinstance(cls.from_pretrained_id, str) else cls.from_pretrained_id
        )

-        self.tokenizers_list = []
-        if self.test_rust_tokenizer:
-            self.tokenizers_list = [
+        cls.tokenizers_list = []
+        if cls.test_rust_tokenizer:
+            cls.tokenizers_list = [
                (
-                    self.rust_tokenizer_class,
+                    cls.rust_tokenizer_class,
                    pretrained_id,
-                    self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
+                    cls.from_pretrained_kwargs if cls.from_pretrained_kwargs is not None else {},
                )
-                for pretrained_id in self.from_pretrained_id
+                for pretrained_id in cls.from_pretrained_id
            ]
        else:
-            self.tokenizers_list = []
+            cls.tokenizers_list = []
        with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
-            self._data = f_data.read().replace("\n\n", "\n").strip()
+            cls._data = f_data.read().replace("\n\n", "\n").strip()

-        self.tmpdirname = tempfile.mkdtemp()
+        cls.tmpdirname = tempfile.mkdtemp()

-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.tmpdirname)

    def get_input_output_texts(self, tokenizer):
        input_txt = self.get_clean_sequence(tokenizer)[0]
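The hunk above moves the mixin's fixtures from per-test setUp/tearDown to per-class hooks. A minimal sketch of that lifecycle, using plain unittest semantics rather than code from this commit: setUpClass and tearDownClass run once per test class, so the temporary directory and the serialized vocab files are created and deleted once instead of once per test method.

# Illustrative only: the per-class fixture lifecycle the mixin switches to.
import shutil
import tempfile
import unittest


class ExampleTokenizationTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.tmpdirname = tempfile.mkdtemp()  # created once for the whole class

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        shutil.rmtree(cls.tmpdirname)  # removed once, after every test in the class has run

    def test_tmpdir_is_shared(self):
        self.assertIsInstance(self.tmpdirname, str)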
@@ -267,11 +304,19 @@ class TokenizerTesterMixin:
        else:
            raise ValueError("This tokenizer class has no tokenizer to be tested.")

-    def get_tokenizer(self, **kwargs) -> PreTrainedTokenizer:
-        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizer:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)

-    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
-        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
+    @classmethod
+    @use_cache_if_possible
+    @lru_cache(maxsize=64)
+    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizerFast:
+        pretrained_name = pretrained_name or cls.tmpdirname
+        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

    def tokenizer_integration_test_util(
        self,
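As a usage sketch (a hypothetical test body, not code from this commit), tests in the mixin now obtain tokenizers through these cached classmethods instead of calling from_pretrained directly; a test that wants to mutate the tokenizer it receives can pass use_cache=False, which the wrapper pops before delegating:

# Hypothetical test method; names and assertions are illustrative only.
def test_pad_token_matches(self):
    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)  # served from the class-level cache
            tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
            fresh_p = self.get_tokenizer(pretrained_name, use_cache=False, **kwargs)  # uncached instance
            self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
            self.assertIsNot(fresh_p, tokenizer_p)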
@@ -1263,7 +1308,7 @@ class TokenizerTesterMixin:
        if not self.test_rust_tokenizer:
            self.skipTest(reason="No fast tokenizer defined")

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name)
+        tokenizer_r = self.get_rust_tokenizer(pretrained_name)
        self._check_no_pad_token_padding(tokenizer_r, conversations)

        tokenizer_r.padding_side = "right"
@@ -1446,7 +1491,7 @@ class TokenizerTesterMixin:
        if not self.test_rust_tokenizer:
            self.skipTest(reason="No fast tokenizer defined")

-        tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name)
+        tokenizer_r = self.get_rust_tokenizer(pretrained_name)

        # Find where to truncate, as the amount of tokens is different for different tokenizers and I want the
        # truncation to happen in the middle of the assistant content.
@@ -2050,11 +2095,9 @@ class TokenizerTesterMixin:
        if self.rust_tokenizer_class is not None:
            pretrained_name = self.from_pretrained_id

-            slow_tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, legacy=False)
+            slow_tokenizer = self.get_tokenizer(pretrained_name, legacy=False)
            with self.subTest(f"{pretrained_name}"):
-                rust_tokenizer = self.rust_tokenizer_class.from_pretrained(
-                    pretrained_name, from_slow=True, legacy=False
-                )
+                rust_tokenizer = self.get_rust_tokenizer(pretrained_name, from_slow=True, legacy=False)
                input_full_vocab_ids = list(
                    range(len(slow_tokenizer))
                )  # TODO let's maybe shuffle this! And run it 4 times. This way we cover more cmbinations
@@ -2200,14 +2243,10 @@ class TokenizerTesterMixin:
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                if self.test_rust_tokenizer:
-                    tokenizer_r = self.rust_tokenizer_class.from_pretrained(
-                        pretrained_name, padding_side="left", **kwargs
-                    )
+                    tokenizer_r = self.get_rust_tokenizer(pretrained_name, padding_side="left", **kwargs)
                    self.assertEqual(tokenizer_r.padding_side, "left")

-                    tokenizer_r = self.rust_tokenizer_class.from_pretrained(
-                        pretrained_name, padding_side="right", **kwargs
-                    )
+                    tokenizer_r = self.get_rust_tokenizer(pretrained_name, padding_side="right", **kwargs)
                    self.assertEqual(tokenizer_r.padding_side, "right")

                    self.assertRaises(
@@ -2219,10 +2258,10 @@ class TokenizerTesterMixin:
                    )

                if self.test_slow_tokenizer:
-                    tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="left", **kwargs)
+                    tokenizer_p = self.get_tokenizer(pretrained_name, padding_side="left", **kwargs)
                    self.assertEqual(tokenizer_p.padding_side, "left")

-                    tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, padding_side="right", **kwargs)
+                    tokenizer_p = self.get_tokenizer(pretrained_name, padding_side="right", **kwargs)
                    self.assertEqual(tokenizer_p.padding_side, "right")

                    self.assertRaises(
@@ -2237,14 +2276,10 @@ class TokenizerTesterMixin:
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                if self.test_rust_tokenizer:
-                    tokenizer_r = self.rust_tokenizer_class.from_pretrained(
-                        pretrained_name, truncation_side="left", **kwargs
-                    )
+                    tokenizer_r = self.get_rust_tokenizer(pretrained_name, truncation_side="left", **kwargs)
                    self.assertEqual(tokenizer_r.truncation_side, "left")

-                    tokenizer_r = self.rust_tokenizer_class.from_pretrained(
-                        pretrained_name, truncation_side="right", **kwargs
-                    )
+                    tokenizer_r = self.get_rust_tokenizer(pretrained_name, truncation_side="right", **kwargs)
                    self.assertEqual(tokenizer_r.truncation_side, "right")

                    self.assertRaises(
@@ -2256,14 +2291,10 @@ class TokenizerTesterMixin:
                    )

                if self.test_slow_tokenizer:
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, truncation_side="left", **kwargs
-                    )
+                    tokenizer_p = self.get_tokenizer(pretrained_name, truncation_side="left", **kwargs)
                    self.assertEqual(tokenizer_p.truncation_side, "left")

-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, truncation_side="right", **kwargs
-                    )
+                    tokenizer_p = self.get_tokenizer(pretrained_name, truncation_side="right", **kwargs)
                    self.assertEqual(tokenizer_p.truncation_side, "right")

                    self.assertRaises(
@@ -3194,18 +3225,18 @@ class TokenizerTesterMixin:
    def test_is_fast(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
                # Check is_fast is set correctly
                self.assertTrue(tokenizer_r.is_fast)

                if self.test_slow_tokenizer:
-                    tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                    tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                    self.assertFalse(tokenizer_p.is_fast)

    def test_fast_only_inputs(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                # Ensure None raise an error
                self.assertRaises(TypeError, tokenizer_r.tokenize, None)
@@ -3216,7 +3247,7 @@ class TokenizerTesterMixin:
    def test_alignement_methods(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
                text = " ".join(words)
@@ -3446,8 +3477,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                # Ensure basic input match
                input_p = tokenizer_p.encode_plus(self._data)
@@ -3487,8 +3518,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                # Check we have the same number of added_tokens for both pair and non-pair inputs.
                self.assertEqual(
@@ -3505,8 +3536,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                # Check we have the correct max_length for both pair and non-pair inputs.
                self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
@@ -3520,8 +3551,8 @@ class TokenizerTesterMixin:
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                # sometimes the tokenizer saved online is not the same
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                # Assert the set of special tokens match.
                self.assertSequenceEqual(
@@ -3532,7 +3563,7 @@ class TokenizerTesterMixin:
    def test_add_tokens(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                vocab_size = len(tokenizer_r)
                self.assertEqual(tokenizer_r.add_tokens(""), 0)
@@ -3558,7 +3589,7 @@ class TokenizerTesterMixin:
    def test_offsets_mapping(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                text = "Wonderful no inspiration example with subtoken"
                pair = "Along with an awesome pair"
@@ -3601,7 +3632,7 @@ class TokenizerTesterMixin:
        This needs to be padded so that it can represented as a tensor
        """
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+            tokenizer = self.get_rust_tokenizer(pretrained_name, **kwargs)

            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"):
                if is_torch_available():
@@ -3663,8 +3694,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                if hasattr(tokenizer_p, "add_prefix_space") and not tokenizer_p.add_prefix_space:
                    continue  # Too hard to test for now
@@ -3745,8 +3776,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                input_simple = [1, 2, 3]
                input_pair = [1, 2, 3]

@@ -3767,8 +3798,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                # # Input string
                # input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False)
                # input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False)
@@ -3812,8 +3843,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                pad_token_id = tokenizer_p.pad_token_id
@@ -4038,8 +4069,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
                pad_token_id = tokenizer_p.pad_token_id

@@ -4076,8 +4107,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)

                tmpdirname2 = tempfile.mkdtemp()

@@ -4151,8 +4182,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
                sentence = "A, <mask> AllenNLP sentence."
                tokens_r = tokenizer_r.encode_plus(
                    sentence,
@@ -4176,7 +4207,7 @@ class TokenizerTesterMixin:
    def test_compare_add_special_tokens(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)

                simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
                # pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True)
@@ -4219,8 +4250,8 @@ class TokenizerTesterMixin:

        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
                string_sequence = "Asserting that both tokenizers are equal"
                python_output = tokenizer_p.prepare_for_model(
                    tokenizer_p.encode(string_sequence, add_special_tokens=False)
@@ -4235,7 +4266,7 @@ class TokenizerTesterMixin:
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                added_tokens = [AddedToken("<special>", lstrip=True)]
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(
+                tokenizer_r = self.get_rust_tokenizer(
                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
                )
                r_output = tokenizer_r.encode("Hey this is a <special> token")
@@ -4246,12 +4277,10 @@ class TokenizerTesterMixin:

                if self.test_slow_tokenizer:
                    # in rust fast, you lose the information of the AddedToken when initializing with `additional_special_tokens`
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_cr = self.get_rust_tokenizer(
                        pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
                    )
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                    )
+                    tokenizer_p = self.get_tokenizer(pretrained_name, additional_special_tokens=added_tokens, **kwargs)

                    p_output = tokenizer_p.encode("Hey this is a <special> token")

@@ -4498,7 +4527,7 @@ class TokenizerTesterMixin:
            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                with tempfile.TemporaryDirectory() as tmp_dir:
                    # Save the fast tokenizer files in a temporary directory
-                    tokenizer_old = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs, use_fast=True)
+                    tokenizer_old = self.get_rust_tokenizer(pretrained_name, **kwargs, use_fast=True)
                    tokenizer_old.save_pretrained(tmp_dir, legacy_format=False)  # save only fast version

                    # Initialize toy model for the trainer
@@ -4532,13 +4561,11 @@ class TokenizerTesterMixin:
                with tempfile.TemporaryDirectory() as tmp_dir_1:
                    # Here we check that even if we have initialized a fast tokenizer with a tokenizer_file we can
                    # still save only the slow version and use these saved files to rebuild a tokenizer
-                    tokenizer_fast_old_1 = self.rust_tokenizer_class.from_pretrained(
-                        pretrained_name, **kwargs, use_fast=True
-                    )
+                    tokenizer_fast_old_1 = self.get_rust_tokenizer(pretrained_name, **kwargs, use_fast=True)
                    tokenizer_file = os.path.join(tmp_dir_1, "tokenizer.json")
                    tokenizer_fast_old_1.backend_tokenizer.save(tokenizer_file)

-                    tokenizer_fast_old_2 = self.rust_tokenizer_class.from_pretrained(
+                    tokenizer_fast_old_2 = self.get_rust_tokenizer(
                        pretrained_name, **kwargs, use_fast=True, tokenizer_file=tokenizer_file
                    )

@@ -4560,10 +4587,10 @@ class TokenizerTesterMixin:
special_token = "<my_new_token>"
special_sentence = f"Hey this is a {special_token} token"
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-tokenizer_rust = self.rust_tokenizer_class.from_pretrained(
+tokenizer_rust = self.get_rust_tokenizer(
pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
)
-tokenizer_py = self.tokenizer_class.from_pretrained(
+tokenizer_py = self.get_tokenizer(
pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
)

@@ -4622,7 +4649,7 @@ class TokenizerTesterMixin:
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
# Load a slow tokenizer from the hub, init with the new token for fast to also include it
-tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
+tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos)
EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
@@ -4662,7 +4689,7 @@ class TokenizerTesterMixin:

with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
if self.rust_tokenizer_class is not None:
-tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
+tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos)
self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
# We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
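Throughout the hunks above, direct `rust_tokenizer_class.from_pretrained(...)` / `tokenizer_class.from_pretrained(...)` calls are routed through `self.get_rust_tokenizer(...)` / `self.get_tokenizer(...)`, so repeated loads of the same pretrained tokenizer within a test class can be served from a cache. The snippet below is a minimal sketch of that caching idea, assuming an `lru_cache`-backed loader and a hypothetical `get_cached_tokenizer` wrapper; the real helpers live in tests/test_tokenization_common.py (note the `use_cache_if_possible` import) and may be implemented differently.

# Illustrative sketch only; names and signatures are assumptions, not the
# repository's actual implementation.
import copy
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_tokenizer_cached(tokenizer_class, pretrained_name, kwargs_items):
    # lru_cache needs hashable arguments, so keyword arguments arrive as a
    # tuple of (key, value) pairs rather than a dict.
    return tokenizer_class.from_pretrained(pretrained_name, **dict(kwargs_items))


def get_cached_tokenizer(tokenizer_class, pretrained_name, **kwargs):
    try:
        tokenizer = _load_tokenizer_cached(
            tokenizer_class, pretrained_name, tuple(sorted(kwargs.items()))
        )
    except TypeError:
        # Unhashable kwargs (e.g. a list of AddedToken) cannot be used as a
        # cache key: fall back to a plain, uncached load.
        return tokenizer_class.from_pretrained(pretrained_name, **kwargs)
    # Hand each test its own copy so mutating one tokenizer cannot leak into
    # the cached instance shared by later tests.
    return copy.deepcopy(tokenizer)

In this sketch, `self.get_rust_tokenizer(pretrained_name, **kwargs)` in the diff would play the role of `get_cached_tokenizer(self.rust_tokenizer_class, pretrained_name, **kwargs)`.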
@@ -33,19 +33,20 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
test_rust_tokenizer = True
from_pretrained_vocab_key = "tokenizer_file"

-def setUp(self):
-self.test_rust_tokenizer = False # because we don't have pretrained_vocab_files_map
-super().setUp()
-self.test_rust_tokenizer = True
+@classmethod
+def setUpClass(cls):
+cls.test_rust_tokenizer = False # because we don't have pretrained_vocab_files_map
+super().setUpClass()
+cls.test_rust_tokenizer = True

model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]
-self.bytelevel_bpe_model_name = "SaulLu/dummy-tokenizer-bytelevel-bpe"
+cls.bytelevel_bpe_model_name = "SaulLu/dummy-tokenizer-bytelevel-bpe"

# Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
-self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]
+cls.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]

tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
-tokenizer.save_pretrained(self.tmpdirname)
+tokenizer.save_pretrained(cls.tmpdirname)

@unittest.skip(
"We disable this test for PreTrainedTokenizerFast because it is the only tokenizer that is not linked to any model"
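The `PreTrainedTokenizationFastTest` hunk applies the same recipe as the model-specific test files in this commit: per-test `setUp` state moves to per-class `setUpClass` state, so the tokenizer fixture is built and saved once per test class instead of once per test method. Below is a minimal sketch of that migration pattern with a hypothetical test class and a plain temporary directory standing in for the mixin's `tmpdirname`; only the unittest pattern itself is taken from the diff.

# Sketch of the setUp -> setUpClass migration shown above; class name and
# fixture contents are hypothetical.
import shutil
import tempfile
import unittest


class ExampleFixtureTest(unittest.TestCase):
    tmpdirname = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # Expensive fixture work (building and saving a tokenizer in the real
        # tests) now runs once for the whole class...
        cls.tmpdirname = tempfile.mkdtemp()

    @classmethod
    def tearDownClass(cls):
        # ...and is cleaned up once, after every test in the class has run.
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)
        super().tearDownClass()

    def test_fixture_is_shared(self):
        # Each test reads the shared class-level fixture; because it is shared,
        # tests must treat it as read-only (or copy it before mutating).
        self.assertIsNotNone(self.tmpdirname)

The trade-off is that the fixture becomes state shared across tests, which is why the attributes in the diff are assigned on `cls` exactly once inside `setUpClass`.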