transformers/tests/models/herbert/test_tokenization_herbert.py
Arthur ef7e93699a
[Tokenizer] Fix slow and fast serialization (#26570)
* fix

* last attempt

* current work

* fix forward compatibility

* save all special tokens

* current state

* revert additional changes

* updates

* remove tokenizer.model

* add a test and the fix

* nit

* revert one more break

* fix typefield issue

* quality

* more tests

* fix fields for FC

* more nits?

* new additional changes

* how

* some updates

* simplify all

* more nits

* revert some things to original

* nice

* nits

* a small hack

* more nits

* ahhaha

* fixup

* update

* make test run on ci

* use subtesting

* update

* Update .circleci/create_circleci_config.py

* updates

* fixup

* nits

* replace typo

* fix the test

* nits

* update

* None max dif pls

* a partial fix

* had to revert one thing

* test the fast

* updates

* fixup

* and more nits

* more fixes

* update

* Oupsy 👁️

* nits

* fix marian

* on our way to heaven

* Update src/transformers/models/t5/tokenization_t5.py

Co-authored-by: Lysandre Debut <hi@lysand.re>

* fixup

* Update src/transformers/tokenization_utils_fast.py

Co-authored-by: Leo Tronchon <leo.tronchon@gmail.com>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Leo Tronchon <leo.tronchon@gmail.com>

* fix phobert

* skip some things, test more

* nits

* fixup

* fix deberta

* update

* update

* more updates

* skip one test

* more updates

* fix camembert

* can't test this one

* more good fixes

* kind of a major update

- seperate what is only done in fast in fast init and refactor
- add_token(AddedToken(..., speicla = True)) ignores it in fast
- better loading

* fixup

* more fixups

* fix pegasus and mpnet

* remove skipped tests

* fix phoneme tokenizer if self.verbose

* fix individual models

* update common tests

* update testing files

* all over again

* nits

* skip test for markup lm

* fixups

* fix order of addition in fast by sorting the added tokens decoder

* proper defaults for deberta

* correct default for fnet

* nits on add tokens, string initialized to special if special

* skip irrelevant herbert tests

* main fixes

* update test added_tokens_serialization

* the fix for bart like models and class instanciating

* update bart

* nit!

* update idefix test

* fix whisper!

* some fixup

* fixups

* revert some of the wrong chanegs

* fixup

* fixup

* skip marian

* skip the correct tests

* skip for tf and flax as well

---------

Co-authored-by: Lysandre Debut <hi@lysand.re>
Co-authored-by: Leo Tronchon <leo.tronchon@gmail.com>
2023-10-18 16:30:53 +02:00

140 lines
4.8 KiB
Python

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, Allegro.pl and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import unittest
from transformers import HerbertTokenizer, HerbertTokenizerFast
from transformers.models.herbert.tokenization_herbert import VOCAB_FILES_NAMES
from transformers.testing_utils import get_tests_dir, require_tokenizers, slow
from ...test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = HerbertTokenizer
rust_tokenizer_class = HerbertTokenizerFast
test_rust_tokenizer = True
def setUp(self):
super().setUp()
# Use a simpler test file without japanese/chinese characters
with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip()
vocab = [
"<s>",
"</s>",
"l",
"o",
"w",
"e",
"r",
"s",
"t",
"i",
"d",
"n",
"w</w>",
"r</w>",
"t</w>",
"lo",
"low",
"er</w>",
"low</w>",
"lowest</w>",
"newer</w>",
"wider</w>",
",</w>",
"<unk>",
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp:
fp.write("\n".join(merges))
def get_input_output_texts(self, tokenizer):
input_text = "lower newer"
output_text = "lower newer"
return input_text, output_text
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(vocab_file=self.vocab_file, merges_file=self.merges_file)
text = "lower"
bpe_tokens = ["low", "er</w>"]
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, bpe_tokens)
input_tokens = tokens + ["<unk>"]
input_bpe_tokens = [16, 17, 23]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "lower,newer"
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
@slow
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("allegro/herbert-base-cased")
text = tokenizer.encode("konstruowanie sekwencji", add_special_tokens=False)
text_2 = tokenizer.encode("konstruowanie wielu sekwencji", add_special_tokens=False)
encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
assert encoded_sentence == [0] + text + [2]
assert encoded_pair == [0] + text + [2] + text_2 + [2]
@unittest.skip(
"Test passes if run individually but not with the full tests (internal state of the tokenizer is modified). Will fix later"
)
def test_training_new_tokenizer_with_special_tokens_change(self):
pass
@unittest.skip(
"Test passes if run individually but not with the full tests (internal state of the tokenizer is modified). Will fix later"
)
def test_training_new_tokenizer(self):
pass