diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index ad53b81fe4e..cdb606e7c60 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -21,6 +21,7 @@ import os
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+import tokenizers.pre_tokenizers as pre_tokenizers_fast
 from tokenizers import Encoding as EncodingFast
 from tokenizers import Tokenizer as TokenizerFast
 from tokenizers.decoders import Decoder as DecoderFast
@@ -699,6 +700,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
         if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
             kwargs["unk_token"] = unk_token
+        if tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel":
+            kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()
 
         trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
         trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py
index 01c49675161..b36bda3b71e 100644
--- a/tests/models/bart/test_modeling_bart.py
+++ b/tests/models/bart/test_modeling_bart.py
@@ -150,6 +150,7 @@ class BartModelTester:
     def get_pipeline_config(self):
         config = self.get_config()
         config.max_position_embeddings = 100
+        config.vocab_size = 300
         return config
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/blenderbot/test_modeling_blenderbot.py b/tests/models/blenderbot/test_modeling_blenderbot.py
index ec626d05e8c..ee76626ffed 100644
--- a/tests/models/blenderbot/test_modeling_blenderbot.py
+++ b/tests/models/blenderbot/test_modeling_blenderbot.py
@@ -140,6 +140,7 @@ class BlenderbotModelTester:
     def get_pipeline_config(self):
         config = self.get_config()
         config.max_position_embeddings = 100
+        config.vocab_size = 300
         return config
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/deberta/test_modeling_deberta.py b/tests/models/deberta/test_modeling_deberta.py
index ee1ba57414d..940a82db439 100644
--- a/tests/models/deberta/test_modeling_deberta.py
+++ b/tests/models/deberta/test_modeling_deberta.py
@@ -130,6 +130,11 @@ class DebertaModelTester(object):
             pos_att_type=self.pos_att_type,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.vocab_size = 300
+        return config
+
     def check_loss_output(self, result):
         self.parent.assertListEqual(list(result.loss.size()), [])
 
diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py
index c474f519d00..0960daff836 100644
--- a/tests/models/gpt2/test_modeling_gpt2.py
+++ b/tests/models/gpt2/test_modeling_gpt2.py
@@ -166,6 +166,11 @@ class GPT2ModelTester:
             reorder_and_upcast_attn=reorder_and_upcast_attn,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.vocab_size = 300
+        return config
+
     def prepare_config_and_inputs_for_decoder(self):
         (
             config,
diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py
index f8607cf1edb..16a775e2731 100644
--- a/tests/models/gpt_neo/test_modeling_gpt_neo.py
+++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py
@@ -151,6 +151,11 @@ class GPTNeoModelTester:
             attention_types=self.attention_types,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.vocab_size = 300
+        return config
+
     def prepare_config_and_inputs_for_decoder(self):
         (
             config,
diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py
index 23ca46eb828..b8b088d42f1 100644
--- a/tests/models/gptj/test_modeling_gptj.py
+++ b/tests/models/gptj/test_modeling_gptj.py
@@ -155,6 +155,11 @@ class GPTJModelTester:
             rotary_dim=self.rotary_dim,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.vocab_size = 300
+        return config
+
     def prepare_config_and_inputs_for_decoder(self):
         (
             config,
diff --git a/tests/models/ibert/test_modeling_ibert.py b/tests/models/ibert/test_modeling_ibert.py
index f8e7b2da2b9..78ba4d4604d 100644
--- a/tests/models/ibert/test_modeling_ibert.py
+++ b/tests/models/ibert/test_modeling_ibert.py
@@ -116,6 +116,11 @@ class IBertModelTester:
             quant_mode=True,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.vocab_size = 300
+        return config
+
     def create_and_check_model(
         self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
     ):
diff --git a/tests/models/led/test_modeling_led.py b/tests/models/led/test_modeling_led.py
index e96f91ecc9d..e7dc31838aa 100644
--- a/tests/models/led/test_modeling_led.py
+++ b/tests/models/led/test_modeling_led.py
@@ -163,6 +163,7 @@ class LEDModelTester:
     def get_pipeline_config(self):
         config = self.get_config()
         config.max_position_embeddings = 100
+        config.vocab_size = 300
         return config
 
     def prepare_config_and_inputs_for_common(self):
diff --git a/tests/models/longformer/test_modeling_longformer.py b/tests/models/longformer/test_modeling_longformer.py
index fd10b14eaea..c1839d67d36 100644
--- a/tests/models/longformer/test_modeling_longformer.py
+++ b/tests/models/longformer/test_modeling_longformer.py
@@ -113,6 +113,11 @@ class LongformerModelTester:
             attention_window=self.attention_window,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.vocab_size = 300
+        return config
+
     def create_and_check_attention_mask_determinism(
         self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
     ):
diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py
index e0b8b78b3b6..7163a357021 100644
--- a/tests/models/roberta/test_modeling_roberta.py
+++ b/tests/models/roberta/test_modeling_roberta.py
@@ -112,6 +112,11 @@ class RobertaModelTester:
             initializer_range=self.initializer_range,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.vocab_size = 300
+        return config
+
     def prepare_config_and_inputs_for_decoder(self):
         (
             config,
diff --git a/tests/models/yoso/test_modeling_yoso.py b/tests/models/yoso/test_modeling_yoso.py
index d71b051d0a2..0a0749dd7d9 100644
--- a/tests/models/yoso/test_modeling_yoso.py
+++ b/tests/models/yoso/test_modeling_yoso.py
@@ -126,6 +126,11 @@ class YosoModelTester:
             initializer_range=self.initializer_range,
         )
 
+    def get_pipeline_config(self):
+        config = self.get_config()
+        config.vocab_size = 300
+        return config
+
     def prepare_config_and_inputs_for_decoder(self):
         (
             config,
diff --git a/tests/tokenization/test_tokenization_fast.py b/tests/tokenization/test_tokenization_fast.py
index 9e5ad178e53..da98d17d772 100644
--- a/tests/tokenization/test_tokenization_fast.py
+++ b/tests/tokenization/test_tokenization_fast.py
@@ -39,6 +39,7 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
         self.test_rust_tokenizer = True
 
         model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]
+        self.bytelevel_bpe_model_name = "SaulLu/dummy-tokenizer-bytelevel-bpe"
 
         # Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
         self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]
@@ -99,6 +100,15 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
         shutil.rmtree(self.tmpdirname)
         self.tmpdirname = tmpdirname_orig
 
+    def test_training_new_tokenizer_with_bytelevel(self):
+        tokenizer = self.rust_tokenizer_class.from_pretrained(self.bytelevel_bpe_model_name)
+
+        toy_text_iterator = ("a" for _ in range(1000))
+        new_tokenizer = tokenizer.train_new_from_iterator(text_iterator=toy_text_iterator, length=1000, vocab_size=50)
+
+        encoding_ids = new_tokenizer.encode("a🤗")
+        self.assertEqual(encoding_ids, [64, 172, 253, 97, 245])
+
 
 @require_tokenizers
 class TokenizerVersioningTest(unittest.TestCase):