* Add return lengths
* make pad a bit more flexible so it can be used as collate_fn
* check all kwargs sent to the encoding method are known
* fix kwargs in encodings
* New AddedToken class in Python. This class lets you specify specific tokenization behaviors for some special tokens, used in particular for GPT2 and Roberta to control how white spaces are stripped around special tokens.
* style and quality
* switched to the huggingface tokenizers library for AddedTokens
* up to tokenizers 0.8.0-rc3 - update API to use AddedToken state
* style and quality
* do not raise an error on additional or unused kwargs for tokenize(), only a warning
* transfo-xl pretrained model requires torch
* Update src/transformers/tokenization_utils.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
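A minimal sketch (not part of the test file below) of the whitespace-stripping control that AddedToken provides, assuming tokenizers >= 0.8.0-rc3 and that the "roberta-base" checkpoint can be downloaded; the lstrip/rstrip flags are the knobs the commit message refers to:

    from tokenizers import AddedToken
    from transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    # lstrip=True lets the mask token absorb the space to its left instead of
    # leaving a separate "Ġ" (encoded space) token in front of it.
    tokenizer.add_special_tokens({"mask_token": AddedToken("<mask>", lstrip=True, rstrip=False)})
    print(tokenizer.convert_ids_to_tokens(tokenizer.encode("Encode <mask> sequence")))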
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import unittest

from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer, RobertaTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin
from .utils import slow


class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = RobertaTokenizer

    def setUp(self):
        super().setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = [
            "l",
            "o",
            "w",
            "e",
            "r",
            "s",
            "t",
            "i",
            "d",
            "n",
            "\u0120",
            "\u0120l",
            "\u0120n",
            "\u0120lo",
            "\u0120low",
            "er",
            "\u0120lowest",
            "\u0120newer",
            "\u0120wider",
            "<unk>",
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return RobertaTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "lower newer"
        output_text = "lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text)  # , add_prefix_space=True)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    def roberta_dict_integration_testing(self):
        # Note: the name does not start with "test_", so unittest's default loader does not collect this method.
        tokenizer = self.get_tokenizer()

        self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
        self.assertListEqual(
            tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False),
            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
        )

    @slow
    def test_sequence_builders(self):
        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_text_from_decode = tokenizer.encode(
            "sequence builders", add_special_tokens=True, add_prefix_space=False
        )
        encoded_pair_from_decode = tokenizer.encode(
            "sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False
        )

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == encoded_text_from_decode
        assert encoded_pair == encoded_pair_from_decode

    def test_space_encoding(self):
        tokenizer = self.get_tokenizer()

        sequence = "Encode this sequence."
        # Byte-level BPE maps the space byte to "\u0120" ("Ġ"), the symbol used for a leading space
        space_encoding = tokenizer.byte_encoder[" ".encode("utf-8")[0]]

        # Testing encoder arguments
        encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=False)
        first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
        self.assertNotEqual(first_char, space_encoding)

        encoded = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
        first_char = tokenizer.convert_ids_to_tokens(encoded[0])[0]
        self.assertEqual(first_char, space_encoding)

        tokenizer.add_special_tokens({"bos_token": "<s>"})
        encoded = tokenizer.encode(sequence, add_special_tokens=True)
        first_char = tokenizer.convert_ids_to_tokens(encoded[1])[0]
        self.assertNotEqual(first_char, space_encoding)

        # Testing spaces after special tokens
        mask = "<mask>"
        tokenizer.add_special_tokens({"mask_token": mask})
        mask_ind = tokenizer.convert_tokens_to_ids(mask)

        sequence = "Encode <mask> sequence"
        sequence_nospace = "Encode <mask>sequence"

        encoded = tokenizer.encode(sequence)
        mask_loc = encoded.index(mask_ind)
        first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
        self.assertEqual(first_char, space_encoding)

        encoded = tokenizer.encode(sequence_nospace)
        mask_loc = encoded.index(mask_ind)
        first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
        self.assertNotEqual(first_char, space_encoding)