Mirror of https://github.com/huggingface/transformers.git (synced 2025-07-03 04:40:06 +06:00)
🚨 rm already deprecated pad_to_max_length arg (#37617)
* rm already deprecated padding max length
* truncate_strategy AS AN ARG is already deprecated for a few years
* fix
* rm test_padding_to_max_length
* rm pad_to_max_length=True in other tests
* rm from common
* missed fnet
This commit is contained in:
parent 7a3e208892
commit c80f65265b
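Migration note: the removal is mechanical. `pad_to_max_length=True` becomes `padding="max_length"` (usually together with `truncation=True`), and `padding=True` pads to the longest sequence in the batch. A minimal before/after sketch, assuming a standard checkpoint (the model name and inputs are illustrative, not from this diff):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint

# Before (removed in this commit; previously emitted a FutureWarning):
#   tok.encode("some text", max_length=128, pad_to_max_length=True)

# After: pad (and truncate) to a fixed length
ids = tok.encode("some text", max_length=128, padding="max_length", truncation=True)

# Or pad dynamically to the longest sequence in the batch
batch = tok(["short", "a slightly longer input"], padding=True)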
@@ -2759,11 +2759,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
     ):
         """
-        Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
-        and pad_to_max_length) and behaviors.
+        Find the correct padding/truncation strategy
         """
-        old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
-        old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

         # Backward compatibility for previous behavior, maybe we should deprecate it:
         # If you only set max_length, it activates truncation for max_length
@@ -2781,21 +2778,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 truncation = "longest_first"

         # Get padding strategy
-        if padding is False and old_pad_to_max_length:
-            if verbose:
-                warnings.warn(
-                    "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
-                    "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
-                    "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
-                    "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
-                    "maximal input size of the model (e.g. 512 for Bert).",
-                    FutureWarning,
-                )
-            if max_length is None:
-                padding_strategy = PaddingStrategy.LONGEST
-            else:
-                padding_strategy = PaddingStrategy.MAX_LENGTH
-        elif padding is not False:
+        if padding is not False:
             if padding is True:
                 if verbose:
                     if max_length is not None and (
@@ -2805,8 +2788,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                             "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
                             "To pad to max length, use `padding='max_length'`."
                         )
-                    if old_pad_to_max_length is not False:
-                        warnings.warn("Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.")
             padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
         elif not isinstance(padding, PaddingStrategy):
             padding_strategy = PaddingStrategy(padding)
@@ -2816,21 +2797,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             padding_strategy = PaddingStrategy.DO_NOT_PAD

         # Get truncation strategy
-        if truncation is None and old_truncation_strategy != "do_not_truncate":
-            if verbose:
-                warnings.warn(
-                    "The `truncation_strategy` argument is deprecated and will be removed in a future version, use"
-                    " `truncation=True` to truncate examples to a max length. You can give a specific length with"
-                    " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input"
-                    " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific"
-                    " truncation strategy selected among `truncation='only_first'` (will only truncate the first"
-                    " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the"
-                    " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence"
-                    " in the pairs).",
-                    FutureWarning,
-                )
-            truncation_strategy = TruncationStrategy(old_truncation_strategy)
-        elif truncation is not False and truncation is not None:
+        if truncation is not False and truncation is not None:
             if truncation is True:
                 truncation_strategy = (
                     TruncationStrategy.LONGEST_FIRST
@@ -3146,7 +3113,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             method).
         """

-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
         padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
             padding=padding,
             truncation=truncation,
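With the legacy kwargs gone, `_get_padding_truncation_strategies` reduces to the documented mapping from user arguments onto the `PaddingStrategy` and `TruncationStrategy` enums. A simplified sketch of that mapping (a paraphrase for illustration, not the verbatim source; the warning branches and max_length handling are omitted):

from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy

def resolve(padding=False, truncation=None):
    # padding: False -> DO_NOT_PAD, True -> LONGEST, str or enum -> PaddingStrategy(value)
    if padding is not False:
        padding_strategy = PaddingStrategy.LONGEST if padding is True else PaddingStrategy(padding)
    else:
        padding_strategy = PaddingStrategy.DO_NOT_PAD

    # truncation: None/False -> DO_NOT_TRUNCATE, True -> LONGEST_FIRST, str or enum -> TruncationStrategy(value)
    if truncation is not False and truncation is not None:
        truncation_strategy = (
            TruncationStrategy.LONGEST_FIRST if truncation is True else TruncationStrategy(truncation)
        )
    else:
        truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

    return padding_strategy, truncation_strategy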
@@ -1074,10 +1074,10 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T

         def _convert_to_features(example_batch):
             input_encodings = tokenizer.batch_encode_plus(
-                example_batch["input_text"], pad_to_max_length=True, max_length=512, truncation=True
+                example_batch["input_text"], padding="max_length", max_length=512, truncation=True
             )
             target_encodings = tokenizer.batch_encode_plus(
-                example_batch["target_text"], pad_to_max_length=True, max_length=16, truncation=True
+                example_batch["target_text"], padding="max_length", max_length=16, truncation=True
             )

             encodings = {
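`batch_encode_plus` and the tokenizer's `__call__` share the padding/truncation resolution shown above, so the updated call could equally be written with the plain call syntax:

# Equivalent to the batch_encode_plus call in the diff above
input_encodings = tokenizer(
    example_batch["input_text"], padding="max_length", max_length=512, truncation=True
)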
@@ -829,7 +829,6 @@ class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
         input_dict = tokenizer(
             [ARTICLE_SIGMA, ARTICLE_AMERICA],
             padding="max_length",
-            pad_to_max_length=True,
             max_length=512,
             return_tensors="pt",
         )
@@ -205,9 +205,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 pad_token_id = tokenizer_p.pad_token_id

                 # Encode - Simple input
-                input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -217,13 +214,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                 # Encode - Pair input
-                input_r = tokenizer_r.encode(
-                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode(
-                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode(
                     "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                 )
@@ -236,14 +226,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                 # Encode_plus - Simple input
-                input_r = tokenizer_r.encode_plus(
-                    "This is a simple input", max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_plus(
-                    "This is a simple input", max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-
                 input_r = tokenizer_r.encode_plus(
                     "This is a simple input", max_length=max_length, padding="max_length"
                 )
@@ -259,14 +241,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 )

                 # Encode_plus - Pair input
-                input_r = tokenizer_r.encode_plus(
-                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_plus(
-                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-
                 input_r = tokenizer_r.encode_plus(
                     "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                 )
@@ -282,18 +256,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 )

                 # Batch_encode_plus - Simple input
-                input_r = tokenizer_r.batch_encode_plus(
-                    ["This is a simple input 1", "This is a simple input 2"],
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    ["This is a simple input 1", "This is a simple input 2"],
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
                 input_r = tokenizer_r.batch_encode_plus(
                     ["This is a simple input 1", "This is a simple input 2"],
                     max_length=max_length,
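The surviving assertions depend on `padding="max_length"` producing identical, fully padded ids from the Rust and Python tokenizers. What `assert_padded_input_match` boils down to, as an illustrative standalone check (a paraphrase of the test-mixin helper, not its source):

def check_padded(input_ids, max_length, pad_token_id, unpadded_len):
    # Right-padded output: fixed total length, original ids first, then only pad tokens
    assert len(input_ids) == max_length
    assert input_ids[unpadded_len:] == [pad_token_id] * (max_length - unpadded_len)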
@@ -566,41 +566,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                 )

-    def test_padding_to_max_length(self):
-        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                words, boxes = self.get_words_and_boxes()
-                padding_size = 10
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, words)
-
-                padding_idx = tokenizer.pad_token_id
-
-                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "right"
-                encoded_sequence = tokenizer.encode(words, boxes=boxes)
-                sequence_length = len(encoded_sequence)
-                # FIXME: the next line should be padding(max_length) to avoid warning
-                padded_sequence = tokenizer.encode(
-                    words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
-                )
-                padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
-
-                # Check that nothing is done when a maximum length is not specified
-                encoded_sequence = tokenizer.encode(words, boxes=boxes)
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
-                padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
-
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -612,9 +577,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode - Simple input
                 words, boxes = self.get_words_and_boxes()
-                input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -625,13 +587,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode - Pair input
                 question, words, boxes = self.get_question_words_and_boxes()
-                input_r = tokenizer_r.encode(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -641,10 +596,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode_plus - Simple input
                 words, boxes = self.get_words_and_boxes()
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
@@ -660,14 +611,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode_plus - Pair input
                 question, words, boxes = self.get_question_words_and_boxes()
-                input_r = tokenizer_r.encode_plus(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_plus(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus(
                     question, words, boxes=boxes, max_length=max_length, padding="max_length"
                 )
@@ -686,20 +629,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # Batch_encode_plus - Simple input
                 words, boxes = self.get_words_and_boxes_batch()

-                input_r = tokenizer_r.batch_encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
                 input_r = tokenizer_r.batch_encode_plus(
                     words,
                     boxes=boxes,
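The layout tokenizers take word-level bounding boxes alongside the words, but padding is configured exactly as above. A minimal usage sketch (the checkpoint and coordinates here are illustrative):

from transformers import LayoutLMv2Tokenizer

tok = LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased")
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # one (x0, y0, x1, y1) box per word
enc = tok(words, boxes=boxes, max_length=32, padding="max_length", truncation=True)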
@@ -460,41 +460,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                 )

-    def test_padding_to_max_length(self):
-        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                words, boxes = self.get_words_and_boxes()
-                padding_size = 10
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, words)
-
-                padding_idx = tokenizer.pad_token_id
-
-                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "right"
-                encoded_sequence = tokenizer.encode(words, boxes=boxes)
-                sequence_length = len(encoded_sequence)
-                # FIXME: the next line should be padding(max_length) to avoid warning
-                padded_sequence = tokenizer.encode(
-                    words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
-                )
-                padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
-
-                # Check that nothing is done when a maximum length is not specified
-                encoded_sequence = tokenizer.encode(words, boxes=boxes)
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
-                padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
-
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -506,9 +471,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode - Simple input
                 words, boxes = self.get_words_and_boxes()
-                input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -519,13 +481,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode - Pair input
                 question, words, boxes = self.get_question_words_and_boxes()
-                input_r = tokenizer_r.encode(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -535,10 +490,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode_plus - Simple input
                 words, boxes = self.get_words_and_boxes()
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
@@ -554,14 +505,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode_plus - Pair input
                 question, words, boxes = self.get_question_words_and_boxes()
-                input_r = tokenizer_r.encode_plus(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_plus(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus(
                     question, words, boxes=boxes, max_length=max_length, padding="max_length"
                 )
@@ -580,20 +523,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # Batch_encode_plus - Simple input
                 words, boxes = self.get_words_and_boxes_batch()

-                input_r = tokenizer_r.batch_encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
                 input_r = tokenizer_r.batch_encode_plus(
                     words,
                     boxes=boxes,
@@ -497,41 +497,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                 )

-    def test_padding_to_max_length(self):
-        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                words, boxes = self.get_words_and_boxes()
-                padding_size = 10
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, words)
-
-                padding_idx = tokenizer.pad_token_id
-
-                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "right"
-                encoded_sequence = tokenizer.encode(words, boxes=boxes)
-                sequence_length = len(encoded_sequence)
-                # FIXME: the next line should be padding(max_length) to avoid warning
-                padded_sequence = tokenizer.encode(
-                    words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
-                )
-                padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
-
-                # Check that nothing is done when a maximum length is not specified
-                encoded_sequence = tokenizer.encode(words, boxes=boxes)
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True)
-                padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
-
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -543,9 +508,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode - Simple input
                 words, boxes = self.get_words_and_boxes()
-                input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -556,13 +518,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode - Pair input
                 question, words, boxes = self.get_question_words_and_boxes()
-                input_r = tokenizer_r.encode(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -572,10 +527,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode_plus - Simple input
                 words, boxes = self.get_words_and_boxes()
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
@@ -591,14 +542,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode_plus - Pair input
                 question, words, boxes = self.get_question_words_and_boxes()
-                input_r = tokenizer_r.encode_plus(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_plus(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus(
                     question, words, boxes=boxes, max_length=max_length, padding="max_length"
                 )
@@ -617,20 +560,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # Batch_encode_plus - Simple input
                 words, boxes = self.get_words_and_boxes_batch()

-                input_r = tokenizer_r.batch_encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    words,
-                    boxes=boxes,
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
                 input_r = tokenizer_r.batch_encode_plus(
                     words,
                     boxes=boxes,
@@ -382,41 +382,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                 )

-    def test_padding_to_max_length(self):
-        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                nodes, xpaths = self.get_nodes_and_xpaths()
-                padding_size = 10
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, nodes)
-
-                padding_idx = tokenizer.pad_token_id
-
-                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "right"
-                encoded_sequence = tokenizer.encode(nodes, xpaths=xpaths)
-                sequence_length = len(encoded_sequence)
-                # FIXME: the next line should be padding(max_length) to avoid warning
-                padded_sequence = tokenizer.encode(
-                    nodes, xpaths=xpaths, max_length=sequence_length + padding_size, pad_to_max_length=True
-                )
-                padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
-
-                # Check that nothing is done when a maximum length is not specified
-                encoded_sequence = tokenizer.encode(nodes, xpaths=xpaths)
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode(nodes, xpaths=xpaths, pad_to_max_length=True)
-                padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
-
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -428,9 +393,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode - Simple input
                 nodes, xpaths = self.get_nodes_and_xpaths()
-                input_r = tokenizer_r.encode(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode(nodes, xpaths=xpaths, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode(nodes, xpaths=xpaths, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -441,13 +403,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode - Pair input
                 question, nodes, xpaths = self.get_question_nodes_and_xpaths()
-                input_r = tokenizer_r.encode(
-                    question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode(
-                    question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode(
                     question, nodes, xpaths=xpaths, max_length=max_length, padding="max_length"
                 )
@@ -461,10 +416,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode_plus - Simple input
                 nodes, xpaths = self.get_nodes_and_xpaths()
-                input_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
@@ -480,14 +431,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode_plus - Pair input
                 question, nodes, xpaths = self.get_question_nodes_and_xpaths()
-                input_r = tokenizer_r.encode_plus(
-                    question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_plus(
-                    question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus(
                     question, nodes, xpaths=xpaths, max_length=max_length, padding="max_length"
                 )
@@ -506,20 +449,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # Batch_encode_plus - Simple input
                 nodes, xpaths = self.get_nodes_and_xpaths_batch()

-                input_r = tokenizer_r.batch_encode_plus(
-                    nodes,
-                    xpaths=xpaths,
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    nodes,
-                    xpaths=xpaths,
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
                 input_r = tokenizer_r.batch_encode_plus(
                     nodes,
                     xpaths=xpaths,
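MarkupLM follows the same pattern with HTML nodes and their xpaths instead of boxes. A minimal sketch (the checkpoint and data here are illustrative):

from transformers import MarkupLMTokenizer

tok = MarkupLMTokenizer.from_pretrained("microsoft/markuplm-base")
nodes = ["hello", "world"]
xpaths = ["/html/body/div/li[1]", "/html/body/div/li[2]"]
enc = tok(nodes, xpaths=xpaths, max_length=32, padding="max_length", truncation=True)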
@@ -657,42 +657,6 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                 )

-    def test_padding_to_max_length(self):
-        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                table = self.get_table(tokenizer)
-                sequence = "Sequence"
-                padding_size = 10
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, sequence)
-
-                padding_idx = tokenizer.pad_token_id
-
-                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "right"
-                encoded_sequence = tokenizer.encode(table, sequence)
-                sequence_length = len(encoded_sequence)
-                # FIXME: the next line should be padding(max_length) to avoid warning
-                padded_sequence = tokenizer.encode(
-                    table, sequence, max_length=sequence_length + padding_size, padding=True
-                )
-                padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
-
-                # Check that nothing is done when a maximum length is not specified
-                encoded_sequence = tokenizer.encode(table, sequence)
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True)
-                padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
-
     def test_call(self):
         # Tests that all call wrap to encode_plus and batch_encode_plus
         tokenizers = self.get_tokenizers(do_lower_case=False)
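Tapas is the odd one out here: its tokenizer encodes a table plus a query, and the removed test already mixed `padding=True` with `max_length` (hence the FIXME above). An illustrative call with the surviving argument (checkpoint and table invented; Tapas expects a pandas DataFrame of strings):

import pandas as pd
from transformers import TapasTokenizer

tok = TapasTokenizer.from_pretrained("google/tapas-base")
table = pd.DataFrame({"city": ["Paris", "Rome"], "country": ["France", "Italy"]})
enc = tok(table=table, queries=["Which city is in France?"], padding="max_length", max_length=64)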
@@ -417,41 +417,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)
                 )

-    def test_padding_to_max_length(self):
-        """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated"""
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                words, boxes = self.get_words_and_boxes()
-                padding_size = 10
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, words)
-
-                padding_idx = tokenizer.pad_token_id
-
-                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "right"
-                encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes)
-                sequence_length = len(encoded_sequence)
-                # FIXME: the next line should be padding(max_length) to avoid warning
-                padded_sequence = tokenizer.encode_boxes(
-                    words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True
-                )
-                padded_sequence_length = len(padded_sequence)
-                assert sequence_length + padding_size == padded_sequence_length
-                assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
-
-                # Check that nothing is done when a maximum length is not specified
-                encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes)
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode_boxes(words, boxes=boxes, pad_to_max_length=True)
-                padded_sequence_right_length = len(padded_sequence_right)
-                assert sequence_length == padded_sequence_right_length
-                assert encoded_sequence == padded_sequence_right
-
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -463,9 +428,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode - Simple input
                 words, boxes = self.get_words_and_boxes()
-                input_r = tokenizer_r.encode_boxes(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode_boxes(words, boxes=boxes, max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode_boxes(words, boxes=boxes, max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode_boxes(words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -476,13 +438,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode - Pair input
                 question, words, boxes = self.get_question_words_and_boxes()
-                input_r = tokenizer_r.encode_boxes(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_boxes(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode_boxes(
                     question, words, boxes=boxes, max_length=max_length, padding="max_length"
                 )
@@ -496,14 +451,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode_plus - Simple input
                 words, boxes = self.get_words_and_boxes()
-                input_r = tokenizer_r.encode_plus_boxes(
-                    words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_plus_boxes(
-                    words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus_boxes(
                     words, boxes=boxes, max_length=max_length, padding="max_length"
                 )
@@ -523,14 +470,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

                 # Encode_plus - Pair input
                 question, words, boxes = self.get_question_words_and_boxes()
-                input_r = tokenizer_r.encode_plus_boxes(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_plus_boxes(
-                    question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus_boxes(
                     question, words, boxes=boxes, max_length=max_length, padding="max_length"
                 )
@@ -549,20 +488,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                 # Batch_encode_plus - Simple input
                 words, boxes = self.get_words_and_boxes_batch()

-                input_r = tokenizer_r.batch_encode_plus_boxes(
-                    words,
-                    boxes=boxes,
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                input_p = tokenizer_p.batch_encode_plus_boxes(
-                    words,
-                    boxes=boxes,
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
                 input_r = tokenizer_r.batch_encode_plus_boxes(
                     words,
                     boxes=boxes,
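UDOP exposes `_boxes` variants of the encode methods (as exercised above), and they accept the same padding arguments. Sketch (the checkpoint is illustrative):

from transformers import UdopTokenizer

tok = UdopTokenizer.from_pretrained("microsoft/udop-large")
words = ["hello", "world"]
boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
ids = tok.encode_boxes(words, boxes=boxes, max_length=32, padding="max_length", truncation=True)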
@@ -2475,41 +2475,6 @@ class TokenizerTesterMixin:
                 self.assertEqual(sequence_length, truncated_sequence_left_length)
                 self.assertEqual(encoded_sequence, truncated_sequence_left)

-    def test_padding_to_max_length(self):
-        """We keep this test for backward compatibility but it should be remove when `pad_to_max_length` is deprecated."""
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                sequence = "Sequence"
-                padding_size = 10
-
-                # check correct behaviour if no pad_token_id exists and add it eventually
-                self._check_no_pad_token_padding(tokenizer, sequence)
-
-                padding_idx = tokenizer.pad_token_id
-
-                # Check that it correctly pads when a maximum length is specified along with the padding flag set to True
-                tokenizer.padding_side = "right"
-                encoded_sequence = tokenizer.encode(sequence)
-                sequence_length = len(encoded_sequence)
-                # FIXME: the next line should be padding(max_length) to avoid warning
-                padded_sequence = tokenizer.encode(
-                    sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
-                )
-                padded_sequence_length = len(padded_sequence)
-                self.assertEqual(sequence_length + padding_size, padded_sequence_length)
-                self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence)
-
-                # Check that nothing is done when a maximum length is not specified
-                encoded_sequence = tokenizer.encode(sequence)
-                sequence_length = len(encoded_sequence)
-
-                tokenizer.padding_side = "right"
-                padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
-                padded_sequence_right_length = len(padded_sequence_right)
-                self.assertEqual(sequence_length, padded_sequence_right_length)
-                self.assertEqual(encoded_sequence, padded_sequence_right)
-
     def test_padding_to_multiple_of(self):
         tokenizers = self.get_tokenizers()
         for tokenizer in tokenizers:
@@ -3900,9 +3865,6 @@ class TokenizerTesterMixin:
                 pad_token_id = tokenizer_p.pad_token_id

                 # Encode - Simple input
-                input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
-                input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
                 input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
@@ -3912,13 +3874,6 @@ class TokenizerTesterMixin:
                 self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                 # Encode - Pair input
-                input_r = tokenizer_r.encode(
-                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode(
-                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
                 input_r = tokenizer_r.encode(
                     "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                 )
@@ -3931,14 +3886,6 @@ class TokenizerTesterMixin:
                 self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)

                 # Encode_plus - Simple input
-                input_r = tokenizer_r.encode_plus(
-                    "This is a simple input", max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_plus(
-                    "This is a simple input", max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus(
                     "This is a simple input", max_length=max_length, padding="max_length"
                 )
@@ -3957,14 +3904,6 @@ class TokenizerTesterMixin:
                 self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                 # Encode_plus - Pair input
-                input_r = tokenizer_r.encode_plus(
-                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
-                )
-                input_p = tokenizer_p.encode_plus(
-                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                 input_r = tokenizer_r.encode_plus(
                     "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                 )
@@ -3981,18 +3920,6 @@ class TokenizerTesterMixin:
                 self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                 # Batch_encode_plus - Simple input
-                input_r = tokenizer_r.batch_encode_plus(
-                    ["This is a simple input 1", "This is a simple input 2"],
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    ["This is a simple input 1", "This is a simple input 2"],
-                    max_length=max_length,
-                    pad_to_max_length=True,
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
                 input_r = tokenizer_r.batch_encode_plus(
                     ["This is a simple input 1", "This is a simple input 2"],
                     max_length=max_length,
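The test that immediately follows the removed one, `test_padding_to_multiple_of`, covers the remaining padding option. For completeness, a short sketch of that mode (the checkpoint is illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
# pad_to_multiple_of rounds the padded length up to a multiple, e.g. for tensor-core-friendly shapes
enc = tok("a short input", padding=True, pad_to_multiple_of=8)
assert len(enc["input_ids"]) % 8 == 0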