# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import inspect import pickle import shutil import tempfile import unittest from transformers import ( SPIECE_UNDERLINE, AddedToken, AutoTokenizer, PreTrainedTokenizerFast, SpecialTokensMixin, ) from transformers.convert_slow_tokenizer import MoshiConverter from transformers.testing_utils import ( get_tests_dir, nested_simplify, require_sentencepiece, require_tokenizers, require_torch, ) from ...test_tokenization_common import SMALL_TRAINING_CORPUS, TokenizerTesterMixin SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") @require_sentencepiece @require_tokenizers class MoshiTokenizationTest(TokenizerTesterMixin, unittest.TestCase): from_pretrained_id = ["kmhf/hf-moshiko"] rust_tokenizer_class = PreTrainedTokenizerFast test_slow_tokenizer = False test_rust_tokenizer = True from_pretrained_kwargs = {} @classmethod def setUpClass(cls): super().setUpClass() # We have a SentencePiece fixture for testing tokenizer = PreTrainedTokenizerFast( tokenizer_object=MoshiConverter(vocab_file=SAMPLE_VOCAB).converted(), bos_token="", unk_token="", eos_token="", ) tokenizer.pad_token = tokenizer.eos_token tokenizer.save_pretrained(cls.tmpdirname) def get_rust_tokenizer(cls, pretrained_name=None, **kwargs) -> PreTrainedTokenizerFast: pretrained_name = pretrained_name or cls.tmpdirname return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) @unittest.skip(reason="No slow tokenizer") def test_added_tokens_serialization(self): pass @unittest.skip(reason="PreTrainedTokenizerFast doesn't have tokenizer_file in its signature") def test_rust_tokenizer_signature(self): pass @unittest.skip(reason="No slow tokenizer") def test_encode_decode_with_spaces(self): pass def test_full_tokenizer(self): tokenizer = PreTrainedTokenizerFast( tokenizer_object=MoshiConverter(vocab_file=SAMPLE_VOCAB).converted(), bos_token="", unk_token="", eos_token="", ) tokens = tokenizer.tokenize("This is a test") self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) self.assertListEqual( tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382], ) tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") self.assertListEqual( tokens, [ SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", ".", ], ) ids = tokenizer.convert_tokens_to_ids(tokens) self.assertListEqual( ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4], ) back_tokens = tokenizer.convert_ids_to_tokens(ids) self.assertListEqual( back_tokens, [ SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", SPIECE_UNDERLINE + "", "", "2", "0", "0", "0", ",", SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "", ".", ], ) def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( pretrained_name, additional_special_tokens=added_tokens, **kwargs ) r_output = tokenizer_r.encode("Hey this is a token") special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0] self.assertTrue(special_token_id in r_output) def test_picklable(self): with tempfile.NamedTemporaryFile() as f: shutil.copyfile(SAMPLE_VOCAB, f.name) tokenizer = PreTrainedTokenizerFast( tokenizer_object=MoshiConverter(vocab_file=f.name).converted(), bos_token="", unk_token="", eos_token="", ) pickled_tokenizer = pickle.dumps(tokenizer) pickle.loads(pickled_tokenizer) def test_training_new_tokenizer(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100) # Test we can use the new tokenizer with something not seen during training inputs = new_tokenizer(["This is the first sentence", "This sentence is different 🤗."]) self.assertEqual(len(inputs["input_ids"]), 2) decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) expected_result = "This is the first sentence" self.assertEqual(expected_result, decoded_input) # We check that the parameters of the tokenizer remained the same # Check we have the same number of added_tokens for both pair and non-pair inputs. self.assertEqual(tokenizer.num_special_tokens_to_add(False), new_tokenizer.num_special_tokens_to_add(False)) self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True)) # Check we have the correct max_length for both pair and non-pair inputs. self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence) self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair) # Assert the set of special tokens match as we didn't ask to change them self.assertSequenceEqual( tokenizer.all_special_tokens_extended, new_tokenizer.all_special_tokens_extended, ) self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map) def test_training_new_tokenizer_with_special_tokens_change(self): # This feature only exists for fast tokenizers if not self.test_rust_tokenizer: self.skipTest(reason="test_rust_tokenizer is set to False") tokenizer = self.get_rust_tokenizer() # Test with a special tokens map class_signature = inspect.signature(tokenizer.__class__) if "cls_token" in class_signature.parameters: new_tokenizer = tokenizer.train_new_from_iterator( SMALL_TRAINING_CORPUS, 100, special_tokens_map={tokenizer.cls_token: ""} ) cls_id = new_tokenizer.get_vocab()[""] self.assertEqual(new_tokenizer.cls_token, "") self.assertEqual(new_tokenizer.cls_token_id, cls_id) # Create a new mapping from the special tokens defined in the original tokenizer special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy() special_tokens_list.remove("additional_special_tokens") special_tokens_map = {} for token in special_tokens_list: # Get the private one to avoid unnecessary warnings. if getattr(tokenizer, token) is not None: special_token = getattr(tokenizer, token) special_tokens_map[special_token] = f"{special_token}a" # Train new tokenizer new_tokenizer = tokenizer.train_new_from_iterator( SMALL_TRAINING_CORPUS, 100, special_tokens_map=special_tokens_map ) # Check the changes for token in special_tokens_list: # Get the private one to avoid unnecessary warnings. if getattr(tokenizer, token) is None: continue special_token = getattr(tokenizer, token) if special_token in special_tokens_map: new_special_token = getattr(new_tokenizer, token) self.assertEqual(special_tokens_map[special_token], new_special_token) new_id = new_tokenizer.get_vocab()[new_special_token] self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id) # Check if the AddedToken / string format has been kept for special_token in tokenizer.all_special_tokens_extended: if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map: # The special token must appear identically in the list of the new tokenizer. self.assertTrue( special_token in new_tokenizer.all_special_tokens_extended, f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}", ) elif isinstance(special_token, AddedToken): # The special token must appear in the list of the new tokenizer as an object of type AddedToken with # the same parameters as the old AddedToken except the content that the user has requested to change. special_token_str = special_token.content new_special_token_str = special_tokens_map[special_token_str] find = False for candidate in new_tokenizer.all_special_tokens_extended: if ( isinstance(candidate, AddedToken) and candidate.content == new_special_token_str and candidate.lstrip == special_token.lstrip and candidate.rstrip == special_token.rstrip and candidate.normalized == special_token.normalized and candidate.single_word == special_token.single_word ): find = True break special_token.content = new_special_token_str self.assertTrue( find, f"'{special_token.__repr__()}' should appear as an `AddedToken` in the all_special_tokens_extended = " f"{[k for k in new_tokenizer.all_special_tokens_extended if str(k) == new_special_token_str]} but it is missing" ", this means that the new tokenizers did not keep the `rstrip`, `lstrip`, `normalized` etc attributes.", ) elif special_token not in special_tokens_map: # The special token must appear identically in the list of the new tokenizer. self.assertTrue( special_token in new_tokenizer.all_special_tokens_extended, f"'{special_token.__repr__()}' should be in {new_tokenizer.all_special_tokens_extended}", ) else: # The special token must appear in the list of the new tokenizer as an object of type string. self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended) # Test we can use the new tokenizer with something not seen during training inputs = new_tokenizer(["This is the first sentence", "This sentence is different 🤗."]) self.assertEqual(len(inputs["input_ids"]), 2) decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True) expected_result = "This is the first sentence" self.assertEqual(expected_result, decoded_input) def test_alignement_methods(self): # TODO: @ArthurZucker - alignment is broken pass def test_added_tokens_do_lower_case(self): # TODO: @ArthurZucker pass @require_torch @require_sentencepiece @require_tokenizers class MoshiIntegrationTest(unittest.TestCase): @classmethod def setUpClass(cls): checkpoint_name = "kmhf/hf-moshiko" cls.rust_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name) return cls @require_torch def integration_tests(self): inputs = self.tokenizer( ["The following string should be properly encoded: Hello.", "But ird and ปี ird ด"], return_tensors="pt", ) long_attention_mask = [1] * 21 # fmt: off self.assertEqual( nested_simplify(inputs), { "input_ids": [ [287, 547, 2359, 457, 297, 3708, 11488, 279, 11725, 263], [588, 478, 1442, 267, 260, 228, 188, 159, 228, 188, 185, 260, 260, 478, 1442, 260, 260, 260, 228, 188, 152], ], "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], long_attention_mask], }, ) # fmt: on def test_fast_special_tokens(self): fast_tokenizer = self.rust_tokenizer fast_tokenizer.add_eos_token = False fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) assert fast == [318, 1145, 694] fast_tokenizer.add_eos_token = True fast = fast_tokenizer.encode("A sample test", add_special_tokens=True) assert fast == [318, 1145, 694] self.rust_tokenizer.add_eos_token = False def test_simple_encode_decode(self): rust_tokenizer = self.rust_tokenizer self.assertEqual(rust_tokenizer.encode("This is a test"), [353, 275, 272, 694]) self.assertEqual(rust_tokenizer.decode([353, 275, 272, 694], skip_special_tokens=True), "This is a test") # bytefallback showcase bytefallback_tokens = [260, 235, 152, 163, 234, 184, 191, 13340, 235, 160, 163, 236, 180, 159, 234, 156, 179] # fmt: skip self.assertEqual(rust_tokenizer.encode("生活的真谛是"), bytefallback_tokens) self.assertEqual( rust_tokenizer.decode(bytefallback_tokens, skip_special_tokens=True), "生活的真谛是", ) # Inner spaces showcase self.assertEqual(rust_tokenizer.encode("Hi Hello"), [2769, 260, 11725]) self.assertEqual(rust_tokenizer.decode([2769, 260, 11725], skip_special_tokens=True), "Hi Hello") self.assertEqual(rust_tokenizer.encode("Hi Hello"), [2769, 260, 260, 11725]) self.assertEqual(rust_tokenizer.decode([2769, 260, 260, 11725], skip_special_tokens=True), "Hi Hello") # TODO: @ArthurZucker # self.assertEqual(rust_tokenizer.encode(""), []) # self.assertEqual(rust_tokenizer.encode(" "), [260, 260]) # self.assertEqual(rust_tokenizer.encode(" "), [260, 260, 260]) # self.assertEqual(rust_tokenizer.encode(" Hello"), [260, 11725]) # self.assertEqual(rust_tokenizer.encode(""), [607, 266, 578]) def test_no_differences_decode(self): rust_tokenizer = self.rust_tokenizer self.assertEqual(rust_tokenizer.decode([869]), "levels") self.assertEqual(rust_tokenizer.decode([30112, 869]), "unanswered levels") @require_sentencepiece @require_tokenizers class CommonSpmIntegrationTests(unittest.TestCase): """ A class that regroups important test to make sure that we properly handle the special tokens. """ def test_edge_case_tabulation(self): fast_tokenizer = AutoTokenizer.from_pretrained("kmhf/hf-moshiko") input_text = "Hey. \t\t \n\nyou é @#😈 🤗! , 1234 15 5,61" EXPECTED_IDS = [11510, 934, 4451, 266, 578, 263, 260, 13, 13, 260, 14, 14, 5209, 260, 260, 1202, 260, 527, 1322, 244, 163, 156, 140, 260, 260, 244, 163, 168, 155, 430, 1047, 261, 260, 265, 270, 278, 281, 260, 265, 280, 260, 280, 261, 285, 265] # fmt: skip EXPECTED_TOKENS = ['▁Hey', '<', 'eo', 's', '>', '.', '▁', '<0x09>', '<0x09>', '▁', '<0x0A>', '<0x0A>', 'you', '▁', '▁', 'é', '▁', '▁@', '#', '<0xF0>', '<0x9F>', '<0x98>', '<0x88>', '▁', '▁', '<0xF0>', '<0x9F>', '<0xA4>', '<0x97>', '!', '▁▁▁▁▁▁▁', ',', '▁', '1', '2', '3', '4', '▁', '1', '5', '▁', '5', ',', '6', '1'] # fmt: skip tokens = fast_tokenizer.tokenize(input_text) with self.subTest("test fast edge case fast"): self.assertEqual(tokens, EXPECTED_TOKENS) input_ids = fast_tokenizer.encode(input_text) with self.subTest("test fast edge case fast"): self.assertEqual(input_ids, EXPECTED_IDS) text = fast_tokenizer.decode(EXPECTED_IDS) with self.subTest("test fast edge case fast"): self.assertEqual(text, "Hey. \t\t \n\nyou é @#😈 🤗! , 1234 15 5,61") input_text = "\t\t\t\t \n\n61" EXPECTED_IDS = [260, 13, 13, 13, 13, 260, 14, 14, 285, 265] EXPECTED_TOKENS = ["▁", "<0x09>", "<0x09>", "<0x09>", "<0x09>", "▁", "<0x0A>", "<0x0A>", "6", "1"] tokens = fast_tokenizer.tokenize(input_text) with self.subTest("test fast edge case fast"): self.assertEqual(tokens, EXPECTED_TOKENS) input_ids = fast_tokenizer.encode(input_text) with self.subTest("test fast edge case fast"): self.assertEqual(input_ids, EXPECTED_IDS) text = fast_tokenizer.decode(EXPECTED_IDS) with self.subTest("test fast edge case fast"): self.assertEqual(text, "\t\t\t\t \n\n61")