# coding=utf-8
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import os
import shutil
import tempfile
import unittest
from typing import List

import pandas as pd

from transformers import AddedToken, TapexTokenizer
from transformers.models.tapex.tokenization_tapex import VOCAB_FILES_NAMES
from transformers.testing_utils import is_pt_tf_cross_test, require_pandas, slow

from ..test_tokenization_common import TokenizerTesterMixin


@require_pandas
class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = TapexTokenizer
    test_rust_tokenizer = False
    from_pretrained_kwargs = {"cls_token": "<s>"}
    test_seq2seq = False

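    # Note: the fixture below writes a tiny toy BPE vocab/merges pair to a temporary
    # directory so a TapexTokenizer can be instantiated locally, without downloading
    # any pretrained files.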
    def setUp(self):
        super().setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        # fmt: off
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "\u0120", "\u0120l", "\u0120n", "\u0120lo", "\u0120low", "er", "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]  # noqa: E231
        # fmt: on
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

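    # The TAPEX tokenizer operates on a pandas DataFrame plus an optional natural
    # language query, so the helpers below build small tables out of tokens from the
    # test vocabulary instead of plain text sequences.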
    def get_table(self, tokenizer, length=5):
        toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]

        if length == 0:
            data = {}
        else:
            data = {toks[0]: [toks[tok] for tok in range(1, length)]}

        table = pd.DataFrame.from_dict(data)

        return table

    def get_table_and_query(self, tokenizer, length=5):
        toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]
        table = self.get_table(tokenizer, length=length - 3)
        query = " ".join(toks[:3])

        return table, query

    def get_clean_sequence(
        self,
        tokenizer,
        with_prefix_space=False,
        max_length=20,
        min_length=5,
        empty_table: bool = False,
        add_special_tokens: bool = True,
        return_table_and_query: bool = False,
    ):
        toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))]

        if empty_table:
            table = pd.DataFrame.from_dict({})
            query = " ".join(toks[:min_length])
        else:
            data = {toks[0]: [toks[tok] for tok in range(1, min_length - 3)]}
            table = pd.DataFrame.from_dict(data)
            query = " ".join(toks[:3])

        output_ids = tokenizer.encode(table, query, add_special_tokens=add_special_tokens)
        output_txt = tokenizer.decode(output_ids)

        if len(output_ids) < min_length:
            raise ValueError("Update the code to generate the sequences so that they are larger")
        if len(output_ids) > max_length:
            raise ValueError("Update the code to generate the sequences so that they are smaller")

        if return_table_and_query:
            return output_txt, output_ids, table, query

        return output_txt, output_ids

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self, tokenizer):
        input_text = "lower newer"
        output_text = "lower newer"
        return input_text, output_text

    def test_full_tokenizer_roberta(self):
        tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower newer"
        bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    def roberta_dict_integration_testing(self):
        tokenizer = self.get_tokenizer()

        self.assertListEqual(tokenizer.encode("Hello world!", add_special_tokens=False), [0, 31414, 232, 328, 2])
        self.assertListEqual(
            tokenizer.encode("Hello world! cécé herlolip 418", add_special_tokens=False),
            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2],
        )

    def test_add_tokens_tokenizer(self):
        tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table = self.get_table(tokenizer, length=0)
                vocab_size = tokenizer.vocab_size
                all_size = len(tokenizer)

                self.assertNotEqual(vocab_size, 0)

                # We usually have added tokens from the start in tests because our vocab fixtures are
                # smaller than the original vocabs - let's not assert this
                # self.assertEqual(vocab_size, all_size)

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
                added_toks = tokenizer.add_tokens(new_toks)
                vocab_size_2 = tokenizer.vocab_size
                all_size_2 = len(tokenizer)

                self.assertNotEqual(vocab_size_2, 0)
                self.assertEqual(vocab_size, vocab_size_2)
                self.assertEqual(added_toks, len(new_toks))
                self.assertEqual(all_size_2, all_size + len(new_toks))

                tokens = tokenizer.encode(table, "aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)

                self.assertGreaterEqual(len(tokens), 4)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
                added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
                vocab_size_3 = tokenizer.vocab_size
                all_size_3 = len(tokenizer)

                self.assertNotEqual(vocab_size_3, 0)
                self.assertEqual(vocab_size, vocab_size_3)
                self.assertEqual(added_toks_2, len(new_toks_2))
                self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

                tokens = tokenizer.encode(
                    table,
                    ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
                    add_special_tokens=False,
                )

                self.assertGreaterEqual(len(tokens), 6)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[0], tokens[1])
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokens[-3])
                self.assertEqual(tokens[0], tokenizer.eos_token_id)
                self.assertEqual(tokens[-2], tokenizer.pad_token_id)

    def test_token_type_ids(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                empty_table = self.get_table(tokenizer, length=0)
                seq_0 = "Test this method."

                # We want sequences 0 and 1 to be tagged with token type ids 0 and 1
                # respectively (regardless of whether the model uses token type ids).
                # We rely on this assumption in the QA pipeline, among other places.
                output = tokenizer(empty_table, seq_0, return_token_type_ids=True)

                # Assert that the token type IDs have the same length as the input IDs
                self.assertEqual(len(output["token_type_ids"]), len(output["input_ids"]))
                self.assertIn(0, output["token_type_ids"])

    def test_add_special_tokens(self):
        tokenizers: List[TapexTokenizer] = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                input_table = self.get_table(tokenizer, length=0)

                special_token = "[SPECIAL_TOKEN]"

                tokenizer.add_special_tokens({"cls_token": special_token})
                encoded_special_token = tokenizer.encode(input_table, special_token, add_special_tokens=False)
                self.assertEqual(len(encoded_special_token), 1)

                decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
                self.assertTrue(special_token not in decoded)

    def test_batch_encode_plus_overflowing_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            table = self.get_table(tokenizer, length=10)
            string_sequences = ["Testing the prepare_for_model method.", "Test"]

            if tokenizer.pad_token is None:
                tokenizer.add_special_tokens({"pad_token": "[PAD]"})

            tokenizer.batch_encode_plus(
                table, string_sequences, return_overflowing_tokens=True, truncation=True, padding=True, max_length=3
            )

    @is_pt_tf_cross_test
    def test_batch_encode_plus_tensors(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequences = [
                    "Testing batch encode plus",
                    "Testing batch encode plus with different sequence lengths",
                    "Testing batch encode plus with different sequence lengths correctly pads",
                ]

                table = self.get_table(tokenizer, length=0)

                # A tensor cannot be built from sequences which are not all the same size
                self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="pt")
                self.assertRaises(ValueError, tokenizer.batch_encode_plus, table, sequences, return_tensors="tf")

                if tokenizer.pad_token_id is None:
                    self.assertRaises(
                        ValueError,
                        tokenizer.batch_encode_plus,
                        table,
                        sequences,
                        padding=True,
                        return_tensors="pt",
                    )
                    self.assertRaises(
                        ValueError,
                        tokenizer.batch_encode_plus,
                        table,
                        sequences,
                        padding="longest",
                        return_tensors="tf",
                    )
                else:
                    pytorch_tensor = tokenizer.batch_encode_plus(table, sequences, padding=True, return_tensors="pt")
                    tensorflow_tensor = tokenizer.batch_encode_plus(
                        table, sequences, padding="longest", return_tensors="tf"
                    )
                    encoded_sequences = tokenizer.batch_encode_plus(table, sequences, padding=True)

                    for key in encoded_sequences.keys():
                        pytorch_value = pytorch_tensor[key].tolist()
                        tensorflow_value = tensorflow_tensor[key].numpy().tolist()
                        encoded_value = encoded_sequences[key]

                        self.assertEqual(pytorch_value, tensorflow_value, encoded_value)

    def test_call(self):
        # Tests that all calls wrap to encode_plus and batch_encode_plus
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequences = [
                    "Testing batch encode plus",
                    "Testing batch encode plus with different sequence lengths",
                    "Testing batch encode plus with different sequence lengths correctly pads",
                ]

                # Test not batched
                table = self.get_table(tokenizer, length=0)
                encoded_sequences_1 = tokenizer.encode_plus(table, sequences[0])
                encoded_sequences_2 = tokenizer(table, sequences[0])
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test not batched pairs
                table = self.get_table(tokenizer, length=10)
                encoded_sequences_1 = tokenizer.encode_plus(table, sequences[1])
                encoded_sequences_2 = tokenizer(table, sequences[1])
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

                # Test batched
                table = self.get_table(tokenizer, length=0)
                encoded_sequences_1 = tokenizer.batch_encode_plus(table, sequences)
                encoded_sequences_2 = tokenizer(table, sequences)
                self.assertEqual(encoded_sequences_1, encoded_sequences_2)

    def test_internal_consistency(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table = self.get_table(tokenizer, length=0)
                input_text, output_text = self.get_input_output_texts(tokenizer)

                tokens = tokenizer.tokenize(input_text)
                ids = tokenizer.convert_tokens_to_ids(tokens)
                ids_2 = tokenizer.encode(table, input_text, add_special_tokens=False)
                self.assertListEqual(ids, ids_2)

                tokens_2 = tokenizer.convert_ids_to_tokens(ids)
                self.assertNotEqual(len(tokens_2), 0)
                text_2 = tokenizer.decode(ids)
                self.assertIsInstance(text_2, str)

                self.assertEqual(text_2, output_text)

    def test_save_and_load_tokenizer(self):
        # safety check on max_len default value so we are sure the test works
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                self.assertNotEqual(tokenizer.model_max_length, 42)

        # Now let's start the test
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                # Isolate this from the other tests because we save additional tokens/etc
                table = self.get_table(tokenizer, length=0)
                tmpdirname = tempfile.mkdtemp()

                sample_text = " He is very happy, UNwant\u00E9d,running"
                before_tokens = tokenizer.encode(table, sample_text, add_special_tokens=False)
                before_vocab = tokenizer.get_vocab()
                tokenizer.save_pretrained(tmpdirname)

                after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                after_tokens = after_tokenizer.encode(table, sample_text, add_special_tokens=False)
                after_vocab = after_tokenizer.get_vocab()
                self.assertListEqual(before_tokens, after_tokens)
                self.assertDictEqual(before_vocab, after_vocab)

                shutil.rmtree(tmpdirname)

    def test_number_of_added_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table, query = self.get_table_and_query(tokenizer)

                sequences = tokenizer.encode(table, query, add_special_tokens=False)
                attached_sequences = tokenizer.encode(table, query, add_special_tokens=True)

                self.assertEqual(2, len(attached_sequences) - len(sequences))

@unittest.skip("TAPEX cannot handle `prepare_for_model` without passing by `encode_plus` or `batch_encode_plus`")
|
|
def test_prepare_for_model(self):
|
|
pass
|
|
|
|
@unittest.skip("TAPEX tokenizer does not support pairs.")
|
|
def test_maximum_encoding_length_pair_input(self):
|
|
pass
|
|
|
|
@unittest.skip("TAPEX tokenizer does not support pairs.")
|
|
def test_maximum_encoding_length_single_input(self):
|
|
pass
|
|
|
|
@unittest.skip("Not implemented")
|
|
def test_right_and_left_truncation(self):
|
|
pass
|
|
|
|
    def test_encode_decode_with_spaces(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table = self.get_table(tokenizer, length=0)

                new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
                tokenizer.add_tokens(new_toks)
                input = "[ABC][DEF][ABC][DEF]"
                if self.space_between_special_tokens:
                    output = "[ABC] [DEF] [ABC] [DEF]"
                else:
                    output = input
                encoded = tokenizer.encode(table, input, add_special_tokens=False)
                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
                self.assertIn(decoded, [output, output.lower()])

    def test_tokenize_special_tokens(self):
        """Test `tokenize` with special tokens."""
        tokenizers = self.get_tokenizers(fast=True, do_lower_case=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                SPECIAL_TOKEN_1 = "[SPECIAL_TOKEN_1]"
                SPECIAL_TOKEN_2 = "[SPECIAL_TOKEN_2]"

                # TODO:
                # Can we combine `unique_no_split_tokens` and `all_special_tokens` (and related properties)
                # into one variable (property) for better maintainability?

                # The `add_tokens` method stores special tokens only in `tokenizer.unique_no_split_tokens`. (in tokenization_utils.py)
                tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True)
                # The `add_special_tokens` method stores special tokens in `tokenizer.additional_special_tokens`,
                # which also occur in `tokenizer.all_special_tokens`. (in tokenization_utils_base.py)
                tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]})

                token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1)
                token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2)

                self.assertEqual(len(token_1), 1)
                self.assertEqual(len(token_2), 1)
                self.assertEqual(token_1[0], SPECIAL_TOKEN_1)
                self.assertEqual(token_2[0], SPECIAL_TOKEN_2)

    def test_special_tokens_mask(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table = self.get_table(tokenizer, length=0)
                sequence_0 = "Encode this."
                # Testing single inputs
                encoded_sequence = tokenizer.encode(table, sequence_0, add_special_tokens=False)
                encoded_sequence_dict = tokenizer.encode_plus(
                    table, sequence_0, add_special_tokens=True, return_special_tokens_mask=True
                )
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

                filtered_sequence = [x for i, x in enumerate(encoded_sequence_w_special) if not special_tokens_mask[i]]
                self.assertEqual(encoded_sequence, filtered_sequence)

    def test_padding_to_max_length(self):
        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` is deprecated."""
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table = self.get_table(tokenizer)
                sequence = "Sequence"
                padding_size = 10

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_idx = tokenizer.pad_token_id

                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(table, sequence)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(
                    table,
                    sequence,
                    max_length=sequence_length + padding_size,
                    pad_to_max_length=True,
                )
                padded_sequence_length = len(padded_sequence)
                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
                self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

                # Check that nothing is done when a maximum length is not specified
                encoded_sequence = tokenizer.encode(table, sequence)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True)
                padded_sequence_right_length = len(padded_sequence_right)
                self.assertEqual(sequence_length, padded_sequence_right_length)
                self.assertListEqual(encoded_sequence, padded_sequence_right)

    def test_padding_to_multiple_of(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table = self.get_table(tokenizer, length=0)
                if tokenizer.pad_token is None:
                    self.skipTest("No padding token.")
                else:
                    empty_tokens = tokenizer(table, padding=True, pad_to_multiple_of=8)
                    normal_tokens = tokenizer(table, "This is a sample input", padding=True, pad_to_multiple_of=8)
                    for key, value in empty_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    normal_tokens = tokenizer(table, "This", pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertNotEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

                    # Should also work with truncation
                    normal_tokens = tokenizer(table, "This", padding=True, truncation=True, pad_to_multiple_of=8)
                    for key, value in normal_tokens.items():
                        self.assertEqual(len(value) % 8, 0, f"BatchEncoding.{key} is not multiple of 8")

    def test_right_and_left_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table = self.get_table(tokenizer, length=0)
                sequence = "Sequence"
                padding_size = 10

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_idx = tokenizer.pad_token_id

                # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "right"
                encoded_sequence = tokenizer.encode(table, sequence)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(
                    table, sequence, max_length=sequence_length + padding_size, padding="max_length"
                )
                padded_sequence_length = len(padded_sequence)
                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
                self.assertListEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)

                # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
                tokenizer.padding_side = "left"
                encoded_sequence = tokenizer.encode(table, sequence)
                sequence_length = len(encoded_sequence)
                padded_sequence = tokenizer.encode(
                    table, sequence, max_length=sequence_length + padding_size, padding="max_length"
                )
                padded_sequence_length = len(padded_sequence)
                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
                self.assertListEqual([padding_idx] * padding_size + encoded_sequence, padded_sequence)

                # RIGHT & LEFT PADDING - Check that nothing is done for 'longest' and 'no_padding'
                encoded_sequence = tokenizer.encode(table, sequence)
                sequence_length = len(encoded_sequence)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(table, sequence, padding=True)
                padded_sequence_right_length = len(padded_sequence_right)
                self.assertEqual(sequence_length, padded_sequence_right_length)
                self.assertListEqual(encoded_sequence, padded_sequence_right)

                tokenizer.padding_side = "left"
                padded_sequence_left = tokenizer.encode(table, sequence, padding="longest")
                padded_sequence_left_length = len(padded_sequence_left)
                self.assertEqual(sequence_length, padded_sequence_left_length)
                self.assertListEqual(encoded_sequence, padded_sequence_left)

                tokenizer.padding_side = "right"
                padded_sequence_right = tokenizer.encode(table, sequence)
                padded_sequence_right_length = len(padded_sequence_right)
                self.assertEqual(sequence_length, padded_sequence_right_length)
                self.assertListEqual(encoded_sequence, padded_sequence_right)

                tokenizer.padding_side = "left"
                padded_sequence_left = tokenizer.encode(table, sequence, padding=False)
                padded_sequence_left_length = len(padded_sequence_left)
                self.assertEqual(sequence_length, padded_sequence_left_length)
                self.assertListEqual(encoded_sequence, padded_sequence_left)

    def test_encode_plus_with_padding(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table = self.get_table(tokenizer, length=0)
                sequence = "Sequence"

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, sequence)

                padding_size = 10
                padding_idx = tokenizer.pad_token_id
                token_type_padding_idx = tokenizer.pad_token_type_id

                encoded_sequence = tokenizer.encode_plus(table, sequence, return_special_tokens_mask=True)
                input_ids = encoded_sequence["input_ids"]
                special_tokens_mask = encoded_sequence["special_tokens_mask"]
                sequence_length = len(input_ids)

                # Test 'longest' and 'no_padding' don't do anything
                tokenizer.padding_side = "right"

                not_padded_sequence = tokenizer.encode_plus(
                    table,
                    sequence,
                    padding=False,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]

                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertEqual(sequence_length, not_padded_sequence_length)
                self.assertListEqual(input_ids, not_padded_input_ids)
                self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)

                not_padded_sequence = tokenizer.encode_plus(
                    table,
                    sequence,
                    padding=False,
                    return_special_tokens_mask=True,
                )
                not_padded_input_ids = not_padded_sequence["input_ids"]

                not_padded_special_tokens_mask = not_padded_sequence["special_tokens_mask"]
                not_padded_sequence_length = len(not_padded_input_ids)

                self.assertEqual(sequence_length, not_padded_sequence_length)
                self.assertListEqual(input_ids, not_padded_input_ids)
                self.assertListEqual(special_tokens_mask, not_padded_special_tokens_mask)

                # Test right padding
                tokenizer.padding_side = "right"

                right_padded_sequence = tokenizer.encode_plus(
                    table,
                    sequence,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                right_padded_input_ids = right_padded_sequence["input_ids"]

                right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
                right_padded_sequence_length = len(right_padded_input_ids)

                self.assertEqual(sequence_length + padding_size, right_padded_sequence_length)
                self.assertListEqual(input_ids + [padding_idx] * padding_size, right_padded_input_ids)
                self.assertListEqual(special_tokens_mask + [1] * padding_size, right_padded_special_tokens_mask)

                # Test left padding
                tokenizer.padding_side = "left"
                left_padded_sequence = tokenizer.encode_plus(
                    table,
                    sequence,
                    max_length=sequence_length + padding_size,
                    padding="max_length",
                    return_special_tokens_mask=True,
                )
                left_padded_input_ids = left_padded_sequence["input_ids"]
                left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                left_padded_sequence_length = len(left_padded_input_ids)

                self.assertEqual(sequence_length + padding_size, left_padded_sequence_length)
                self.assertListEqual([padding_idx] * padding_size + input_ids, left_padded_input_ids)
                self.assertListEqual([1] * padding_size + special_tokens_mask, left_padded_special_tokens_mask)

                if "token_type_ids" in tokenizer.model_input_names:
                    token_type_ids = encoded_sequence["token_type_ids"]
                    left_padded_token_type_ids = left_padded_sequence["token_type_ids"]
                    right_padded_token_type_ids = right_padded_sequence["token_type_ids"]

                    self.assertListEqual(
                        token_type_ids + [[token_type_padding_idx] * 7] * padding_size, right_padded_token_type_ids
                    )
                    self.assertListEqual(
                        [[token_type_padding_idx] * 7] * padding_size + token_type_ids, left_padded_token_type_ids
                    )

                if "attention_mask" in tokenizer.model_input_names:
                    attention_mask = encoded_sequence["attention_mask"]
                    right_padded_attention_mask = right_padded_sequence["attention_mask"]
                    left_padded_attention_mask = left_padded_sequence["attention_mask"]

                    self.assertListEqual(attention_mask + [0] * padding_size, right_padded_attention_mask)
                    self.assertListEqual([0] * padding_size + attention_mask, left_padded_attention_mask)

    def test_batch_encode_plus_padding(self):
        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus

        # Right padding tests
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table = self.get_table(tokenizer, length=0)
                sequences = [
                    "Testing batch encode plus",
                    "Testing batch encode plus with different sequence lengths",
                    "Testing batch encode plus with different sequence lengths correctly pads",
                ]

                max_length = 100

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, sequences)

                encoded_sequences = [
                    tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
                    for sequence in sequences
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(
                    table, sequences, max_length=max_length, padding="max_length"
                )
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

        # Left padding tests
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                tokenizer.padding_side = "left"
                sequences = [
                    "Testing batch encode plus",
                    "Testing batch encode plus with different sequence lengths",
                    "Testing batch encode plus with different sequence lengths correctly pads",
                ]

                max_length = 100

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, sequences)

                encoded_sequences = [
                    tokenizer.encode_plus(table, sequence, max_length=max_length, padding="max_length")
                    for sequence in sequences
                ]
                encoded_sequences_batch = tokenizer.batch_encode_plus(
                    table, sequences, max_length=max_length, padding="max_length"
                )
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

    def test_batch_encode_plus_batch_sequence_length(self):
        # Tests that all encoded values have the correct size
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                table = self.get_table(tokenizer, length=0)
                sequences = [
                    "Testing batch encode plus",
                    "Testing batch encode plus with different sequence lengths",
                    "Testing batch encode plus with different sequence lengths correctly pads",
                ]

                encoded_sequences = [tokenizer.encode_plus(table, sequence) for sequence in sequences]
                encoded_sequences_batch = tokenizer.batch_encode_plus(table, sequences, padding=False)
                self.assertListEqual(
                    encoded_sequences, self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch)
                )

                maximum_length = len(
                    max([encoded_sequence["input_ids"] for encoded_sequence in encoded_sequences], key=len)
                )

                # check correct behaviour if no pad_token_id exists and add it if needed
                self._check_no_pad_token_padding(tokenizer, sequences)

                encoded_sequences_padded = [
                    tokenizer.encode_plus(table, sequence, max_length=maximum_length, padding="max_length")
                    for sequence in sequences
                ]

                encoded_sequences_batch_padded = tokenizer.batch_encode_plus(table, sequences, padding=True)
                self.assertListEqual(
                    encoded_sequences_padded,
                    self.convert_batch_encode_plus_format_to_encode_plus(encoded_sequences_batch_padded),
                )

                # check 'longest' is insensitive to a max length
                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=True)
                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                    table, sequences, max_length=maximum_length + 10, padding="longest"
                )
                for key in encoded_sequences_batch_padded_1.keys():
                    self.assertListEqual(
                        encoded_sequences_batch_padded_1[key],
                        encoded_sequences_batch_padded_2[key],
                    )

                # check 'no_padding' is insensitive to a max length
                encoded_sequences_batch_padded_1 = tokenizer.batch_encode_plus(table, sequences, padding=False)
                encoded_sequences_batch_padded_2 = tokenizer.batch_encode_plus(
                    table, sequences, max_length=maximum_length + 10, padding=False
                )
                for key in encoded_sequences_batch_padded_1.keys():
                    self.assertListEqual(
                        encoded_sequences_batch_padded_1[key],
                        encoded_sequences_batch_padded_2[key],
                    )

    def test_special_tokens_mask_input_pairs(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                sequence_0 = "Encode this."
                empty_table = self.get_table(tokenizer, length=0)
                table = self.get_table(tokenizer, length=10)
                encoded_sequence = tokenizer.encode(empty_table, sequence_0, add_special_tokens=False)
                number_of_tokens = len(encoded_sequence)
                encoded_sequence += tokenizer.encode(table, "", add_special_tokens=False)
                encoded_sequence_dict = tokenizer.encode_plus(
                    table,
                    sequence_0,
                    add_special_tokens=True,
                    return_special_tokens_mask=True,
                )
                encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
                special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
                self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

                filtered_sequence = [
                    (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
                ]
                # NOTE: as TAPEX adds a space between a table and a sequence, we need to remove it
                # in order to have equivalent results with encoding an empty table or empty sequence
                del filtered_sequence[number_of_tokens + 1]
                filtered_sequence = [x for x in filtered_sequence if x is not None]
                print("Encoded sequence:", encoded_sequence)
                print("Filtered sequence:", filtered_sequence)
                self.assertEqual(encoded_sequence, filtered_sequence)

    @slow
    def test_full_tokenizer(self):
        question = "Greece held its last Summer Olympics in 2004"
        table_dict = {
            "header": ["Year", "City", "Country", "Nations"],
            "rows": [
                [1896, "Athens", "Greece", 14],
                [1900, "Paris", "France", 24],
                [1904, "St. Louis", "USA", 12],
                [2004, "Athens", "Greece", 201],
                [2008, "Beijing", "China", 204],
                [2012, "London", "UK", 204],
            ],
        }
        table = pd.DataFrame.from_dict(table_dict["rows"])
        table.columns = table_dict["header"]

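        # TapexTokenizer linearizes the table into a flat string, roughly of the form
        # "col : Year | City | ... row 1 : 1896 | Athens | ..." (lowercased by default),
        # and BPE-encodes it together with the question.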
        tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
        encoding = tokenizer(table, question)

        # fmt: off
        expected_results = {'input_ids': [0, 821, 5314, 1755, 547, 63, 94, 1035, 1021, 31434, 2857, 11, 4482, 11311, 4832, 76, 1721, 343, 1721, 247, 1721, 3949, 3236, 112, 4832, 42773, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 501, 3236, 132, 4832, 23137, 1721, 2242, 354, 1721, 6664, 2389, 1721, 706, 3236, 155, 4832, 42224, 1721, 1690, 4, 26120, 354, 1721, 201, 102, 1721, 316, 3236, 204, 4832, 4482, 1721, 23, 27859, 1721, 821, 5314, 1755, 1721, 21458, 3236, 195, 4832, 2266, 1721, 28, 40049, 1721, 1855, 1243, 1721, 28325, 3236, 231, 4832, 1125, 1721, 784, 24639, 1721, 1717, 330, 1721, 28325, 2]}
        # fmt: on

        self.assertListEqual(encoding.input_ids, expected_results["input_ids"])

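    # The test below encodes the answer text in target mode (`as_target_tokenizer`),
    # i.e. as a standalone sequence rather than as a table/question pair.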
    def test_tokenizer_as_target(self):
        # by default, the tokenizer lowercases its input (do_lower_case=True)
        tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
        answer_text = "tapex is a good model!"
        expected_src_tokens = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2]
        with tokenizer.as_target_tokenizer():
            answer_encoding = tokenizer(answer=answer_text)
            self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)

    @slow
    def test_tokenizer_lower_case(self):
        cased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=False)
        uncased_tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base", do_lower_case=True)
        answer_text = "Beijing, London, Paris"
        answer_text_lower = "beijing, london, paris"

        with cased_tokenizer.as_target_tokenizer():
            with uncased_tokenizer.as_target_tokenizer():
                self.assertNotEqual(
                    cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids
                )
                self.assertEqual(
                    cased_tokenizer(answer=answer_text_lower).input_ids,
                    uncased_tokenizer(answer=answer_text).input_ids,
                )
                # batched encoding assert
                self.assertNotEqual(
                    cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids
                )
                self.assertEqual(
                    cased_tokenizer(answer=[answer_text_lower]).input_ids,
                    uncased_tokenizer(answer=[answer_text]).input_ids,
                )
                # test input encoding lowercase
                question = "Greece held its last Summer Olympics in 2004"
                table_dict = {
                    "header": ["Year", "City", "Country", "Nations"],
                    "rows": [
                        [1896, "Athens", "Greece", 14],
                        [1900, "Paris", "France", 24],
                        [1904, "St. Louis", "USA", 12],
                        [2004, "Athens", "Greece", 201],
                        [2008, "Beijing", "China", 204],
                        [2012, "London", "UK", 204],
                    ],
                }
                table = pd.DataFrame.from_dict(table_dict["rows"])
                table.columns = table_dict["header"]

                self.assertNotEqual(
                    cased_tokenizer(table=table, query=question).input_ids,
                    uncased_tokenizer(table=table, query=question).input_ids,
                )