Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-19 20:48:22 +06:00
[GemmaConverter] use user_defined_symbols (#29473)
* use user_defined_symbols
* fixup
* nit
* add a very robust test
* make sure all models are tested with the `pretrained_tokenizer_to_test`
* should we make sure we test all of them?
* merge
* remove the id
* fix test
* update
* ousies
* oups
* fixup
* fix copies check
* remove `pretrained_tokenizer_to_test`
This commit is contained in:
parent 8e2fc52ea3
commit 2f9a3edbb9
@@ -1319,7 +1319,10 @@ class GemmaConvert(SpmConverter):
             raise Exception(
                 "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
             )
+        user_defined_symbols = [
+            AddedToken(token, normalized=False, special=False) for token in proto.trainer_spec.user_defined_symbols
+        ]
+        tokenizer.add_tokens(user_defined_symbols)
         return tokenizer
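For context, the symbols added above come from the `trainer_spec.user_defined_symbols` field of the original SentencePiece `tokenizer.model`; the converter now registers them on the fast tokenizer as non-special `AddedToken`s so they survive the conversion. A minimal sketch of how that field can be inspected, assuming a local SentencePiece file at the placeholder path "tokenizer.model" (not a file referenced by this commit):

# Sketch only: inspect the user-defined symbols stored in a SentencePiece model.
from sentencepiece import sentencepiece_model_pb2 as model_pb2

proto = model_pb2.ModelProto()
with open("tokenizer.model", "rb") as f:  # placeholder path
    proto.ParseFromString(f.read())

# These are exactly the tokens the converter wraps in AddedToken(..., special=False).
print(list(proto.trainer_spec.user_defined_symbols))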
@@ -52,7 +52,7 @@ if is_torch_available():
 @require_sentencepiece
 @require_tokenizers
 class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "hf-internal-testing/llama-tokenizer"
+    from_pretrained_id = ["hf-internal-testing/llama-tokenizer", "meta-llama/Llama-2-7b-hf"]
     tokenizer_class = LlamaTokenizer
     rust_tokenizer_class = LlamaTokenizerFast
@@ -51,6 +51,7 @@ from transformers.testing_utils import (
     get_tests_dir,
     is_pt_tf_cross_test,
     require_jinja,
+    require_read_token,
     require_tf,
     require_tokenizers,
     require_torch,
@@ -200,13 +201,19 @@ class TokenizerTesterMixin:
     def setUp(self) -> None:
         # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
         # information available in Tokenizer (name, rust class, python class, vocab key name)
+        self.from_pretrained_id = (
+            [self.from_pretrained_id] if isinstance(self.from_pretrained_id, str) else self.from_pretrained_id
+        )
+
+        self.tokenizers_list = []
         if self.test_rust_tokenizer:
             self.tokenizers_list = [
                 (
                     self.rust_tokenizer_class,
-                    self.from_pretrained_id,
+                    pretrained_id,
                     self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
                 )
+                for pretrained_id in self.from_pretrained_id
             ]
         else:
             self.tokenizers_list = []
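To illustrate the effect (a standalone sketch, not part of the commit): `from_pretrained_id` may now be either a single checkpoint id or a list, `setUp` normalizes it to a list, and `tokenizers_list` gets one entry per checkpoint, so the parametrized rust-tokenizer tests run against all of them. The values below mirror the Llama test class above; the class name is a stand-in string rather than the real class object.

# Standalone sketch of the setUp normalization and expansion shown in the hunk above.
from_pretrained_id = ["hf-internal-testing/llama-tokenizer", "meta-llama/Llama-2-7b-hf"]
from_pretrained_kwargs = None
rust_tokenizer_class = "LlamaTokenizerFast"  # stand-in for the real class object

ids = [from_pretrained_id] if isinstance(from_pretrained_id, str) else from_pretrained_id
tokenizers_list = [
    (rust_tokenizer_class, pretrained_id, from_pretrained_kwargs if from_pretrained_kwargs is not None else {})
    for pretrained_id in ids
]
# -> [("LlamaTokenizerFast", "hf-internal-testing/llama-tokenizer", {}),
#     ("LlamaTokenizerFast", "meta-llama/Llama-2-7b-hf", {})]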
@@ -1544,6 +1551,56 @@ class TokenizerTesterMixin:
         self.assertEqual(len(overflowing_tokens), 2 + stride)
         self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :])
 
+    @slow
+    @require_read_token
+    def test_encode_decode_fast_slow_all_tokens(self):
+        if self.rust_tokenizer_class is not None:
+            pretrained_name = self.from_pretrained_id
+
+            slow_tokenizer = self.tokenizer_class.from_pretrained(pretrained_name, legacy=False)
+            with self.subTest(f"{pretrained_name}"):
+                rust_tokenizer = self.rust_tokenizer_class.from_pretrained(
+                    pretrained_name, from_slow=True, legacy=False
+                )
+                input_full_vocab_ids = list(
+                    range(len(slow_tokenizer))
+                )  # TODO let's maybe shuffle this! And run it 4 times. This way we cover more cmbinations
+                input_full_vocab_string = rust_tokenizer.convert_tokens_to_string(
+                    rust_tokenizer.convert_ids_to_tokens(input_full_vocab_ids)
+                )
+                print(f"Length of the input string that is tested: {len(input_full_vocab_string)}")
+
+                for chunk in range(0, len(input_full_vocab_string) - 1024, 1024):
+                    string_to_check = input_full_vocab_string[chunk : chunk + 1024]
+                    with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
+                        slow_encode = slow_tokenizer.encode(string_to_check)
+                        fast_encode = rust_tokenizer.encode(string_to_check)
+                        self.assertEquals(
+                            slow_encode,
+                            fast_encode,
+                            "Hint: the following tokenization diff were obtained for slow vs fast:\n "
+                            f"elements in slow: {set(slow_tokenizer.tokenize(string_to_check))-set(rust_tokenizer.tokenize(string_to_check))} \nvs\n "
+                            f"elements in fast: {set(rust_tokenizer.tokenize(string_to_check))-set(slow_tokenizer.tokenize(string_to_check))} \n"
+                            f"string used : {string_to_check}",
+                        )
+                print(f"Length of the input ids that is tested: {len(input_full_vocab_ids)}")
+                for chunk in range(0, len(input_full_vocab_ids) - 100, 100):
+                    ids_to_decode = input_full_vocab_ids[chunk : chunk + 100]
+                    with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"):
+                        self.assertEquals(
+                            slow_tokenizer.decode(
+                                ids_to_decode,
+                                space_between_special_tokens=False,
+                                clean_up_tokenization_spaces=False,
+                            ),
+                            rust_tokenizer.decode(
+                                ids_to_decode,
+                                space_between_special_tokens=False,
+                                clean_up_tokenization_spaces=False,
+                            ),
+                            f"Hint here are the tokens being decoded.: {slow_tokenizer.convert_ids_to_tokens(ids_to_decode)}",
+                        )
+
     # def test_encode_input_type(self):
     #     tokenizers = self.get_tokenizers(do_lower_case=False)
     #     for tokenizer in tokenizers:
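The new test is gated by `@slow` and the newly imported `@require_read_token`, so it only runs when slow tests are enabled and a Hugging Face token with access to the gated `meta-llama/Llama-2-7b-hf` checkpoint is available. Below is a standalone sketch (not part of the diff) of the slow-vs-fast consistency property the test exercises chunk by chunk over the whole vocabulary, shown here on an arbitrary probe string; the checkpoint id is the non-gated one from the Llama test above.

# Sketch of the slow-vs-fast consistency check automated by the test above.
from transformers import LlamaTokenizer, LlamaTokenizerFast

checkpoint = "hf-internal-testing/llama-tokenizer"
slow = LlamaTokenizer.from_pretrained(checkpoint, legacy=False)
fast = LlamaTokenizerFast.from_pretrained(checkpoint, from_slow=True, legacy=False)

text = "The quick brown fox jumps over the lazy dog."
ids = slow.encode(text)

# The two tokenizers are expected to agree on both the ids and the decoded string,
# which is what the test asserts over 1024-character / 100-id chunks of the vocabulary.
assert ids == fast.encode(text)
assert slow.decode(ids) == fast.decode(ids)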