diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 1ab60a964f2..6635481ff91 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2759,11 +2759,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs ): """ - Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy - and pad_to_max_length) and behaviors. + Find the correct padding/truncation strategy """ - old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") - old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) # Backward compatibility for previous behavior, maybe we should deprecate it: # If you only set max_length, it activates truncation for max_length @@ -2781,21 +2778,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): truncation = "longest_first" # Get padding strategy - if padding is False and old_pad_to_max_length: - if verbose: - warnings.warn( - "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " - "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " - "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " - "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " - "maximal input size of the model (e.g. 512 for Bert).", - FutureWarning, - ) - if max_length is None: - padding_strategy = PaddingStrategy.LONGEST - else: - padding_strategy = PaddingStrategy.MAX_LENGTH - elif padding is not False: + if padding is not False: if padding is True: if verbose: if max_length is not None and ( @@ -2805,8 +2788,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " "To pad to max length, use `padding='max_length'`." ) - if old_pad_to_max_length is not False: - warnings.warn("Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.") padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch elif not isinstance(padding, PaddingStrategy): padding_strategy = PaddingStrategy(padding) @@ -2816,21 +2797,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): padding_strategy = PaddingStrategy.DO_NOT_PAD # Get truncation strategy - if truncation is None and old_truncation_strategy != "do_not_truncate": - if verbose: - warnings.warn( - "The `truncation_strategy` argument is deprecated and will be removed in a future version, use" - " `truncation=True` to truncate examples to a max length. You can give a specific length with" - " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input" - " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific" - " truncation strategy selected among `truncation='only_first'` (will only truncate the first" - " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the" - " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence" - " in the pairs).", - FutureWarning, - ) - truncation_strategy = TruncationStrategy(old_truncation_strategy) - elif truncation is not False and truncation is not None: + if truncation is not False and truncation is not None: if truncation is True: truncation_strategy = ( TruncationStrategy.LONGEST_FIRST @@ -3146,7 +3113,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): method). """ - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, truncation=truncation, diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index b2b96613e63..f07756f731f 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -1074,10 +1074,10 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T def _convert_to_features(example_batch): input_encodings = tokenizer.batch_encode_plus( - example_batch["input_text"], pad_to_max_length=True, max_length=512, truncation=True + example_batch["input_text"], padding="max_length", max_length=512, truncation=True ) target_encodings = tokenizer.batch_encode_plus( - example_batch["target_text"], pad_to_max_length=True, max_length=16, truncation=True + example_batch["target_text"], padding="max_length", max_length=16, truncation=True ) encodings = { diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index 18b7d0b6129..6df93374137 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -829,7 +829,6 @@ class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase): input_dict = tokenizer( [ARTICLE_SIGMA, ARTICLE_AMERICA], padding="max_length", - pad_to_max_length=True, max_length=512, return_tensors="pt", ) diff --git a/tests/models/fnet/test_tokenization_fnet.py b/tests/models/fnet/test_tokenization_fnet.py index b70aa33e0a1..3efb764e18f 100644 --- a/tests/models/fnet/test_tokenization_fnet.py +++ b/tests/models/fnet/test_tokenization_fnet.py @@ -205,9 +205,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): pad_token_id = tokenizer_p.pad_token_id # Encode - Simple input - input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length") input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) @@ -217,13 +214,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) # Encode - Pair input - input_r = tokenizer_r.encode( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode( "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" ) @@ -236,14 +226,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) # Encode_plus - Simple input - input_r = tokenizer_r.encode_plus( - "This is a simple input", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - input_r = tokenizer_r.encode_plus( "This is a simple input", max_length=max_length, padding="max_length" ) @@ -259,14 +241,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ) # Encode_plus - Pair input - input_r = tokenizer_r.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - input_r = tokenizer_r.encode_plus( "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" ) @@ -282,18 +256,6 @@ class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ) # Batch_encode_plus - Simple input - input_r = tokenizer_r.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - pad_to_max_length=True, - ) - input_p = tokenizer_p.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - pad_to_max_length=True, - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.batch_encode_plus( ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index fcf46670dbe..6eb8abf0b50 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -566,41 +566,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) ) - def test_padding_to_max_length(self): - """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, words) - - padding_idx = tokenizer.pad_token_id - - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(words, boxes=boxes) - sequence_length = len(encoded_sequence) - # FIXME: the next line should be padding(max_length) to avoid warning - padded_sequence = tokenizer.encode( - words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - - # Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode(words, boxes=boxes) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - def test_padding(self, max_length=50): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -612,9 +577,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode - Simple input words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) @@ -625,13 +587,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode - Pair input question, words, boxes = self.get_question_words_and_boxes() - input_r = tokenizer_r.encode( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) @@ -641,10 +596,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode_plus - Simple input words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) @@ -660,14 +611,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode_plus - Pair input question, words, boxes = self.get_question_words_and_boxes() - input_r = tokenizer_r.encode_plus( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus( question, words, boxes=boxes, max_length=max_length, padding="max_length" ) @@ -686,20 +629,6 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Batch_encode_plus - Simple input words, boxes = self.get_words_and_boxes_batch() - input_r = tokenizer_r.batch_encode_plus( - words, - boxes=boxes, - max_length=max_length, - pad_to_max_length=True, - ) - input_p = tokenizer_p.batch_encode_plus( - words, - boxes=boxes, - max_length=max_length, - pad_to_max_length=True, - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.batch_encode_plus( words, boxes=boxes, diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index a1989d814ad..6e5f1ee11a7 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -460,41 +460,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) ) - def test_padding_to_max_length(self): - """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, words) - - padding_idx = tokenizer.pad_token_id - - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(words, boxes=boxes) - sequence_length = len(encoded_sequence) - # FIXME: the next line should be padding(max_length) to avoid warning - padded_sequence = tokenizer.encode( - words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - - # Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode(words, boxes=boxes) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - def test_padding(self, max_length=50): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -506,9 +471,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode - Simple input words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) @@ -519,13 +481,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode - Pair input question, words, boxes = self.get_question_words_and_boxes() - input_r = tokenizer_r.encode( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) @@ -535,10 +490,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode_plus - Simple input words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) @@ -554,14 +505,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode_plus - Pair input question, words, boxes = self.get_question_words_and_boxes() - input_r = tokenizer_r.encode_plus( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus( question, words, boxes=boxes, max_length=max_length, padding="max_length" ) @@ -580,20 +523,6 @@ class LayoutLMv3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Batch_encode_plus - Simple input words, boxes = self.get_words_and_boxes_batch() - input_r = tokenizer_r.batch_encode_plus( - words, - boxes=boxes, - max_length=max_length, - pad_to_max_length=True, - ) - input_p = tokenizer_p.batch_encode_plus( - words, - boxes=boxes, - max_length=max_length, - pad_to_max_length=True, - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.batch_encode_plus( words, boxes=boxes, diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index 2bd76954d6d..056726f0047 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -497,41 +497,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) ) - def test_padding_to_max_length(self): - """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, words) - - padding_idx = tokenizer.pad_token_id - - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(words, boxes=boxes) - sequence_length = len(encoded_sequence) - # FIXME: the next line should be padding(max_length) to avoid warning - padded_sequence = tokenizer.encode( - words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - - # Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode(words, boxes=boxes) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(words, boxes=boxes, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - def test_padding(self, max_length=50): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -543,9 +508,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode - Simple input words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode(words, boxes=boxes, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode(words, boxes=boxes, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) @@ -556,13 +518,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode - Pair input question, words, boxes = self.get_question_words_and_boxes() - input_r = tokenizer_r.encode( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode(question, words, boxes=boxes, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) @@ -572,10 +527,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode_plus - Simple input words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) @@ -591,14 +542,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode_plus - Pair input question, words, boxes = self.get_question_words_and_boxes() - input_r = tokenizer_r.encode_plus( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus( question, words, boxes=boxes, max_length=max_length, padding="max_length" ) @@ -617,20 +560,6 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Batch_encode_plus - Simple input words, boxes = self.get_words_and_boxes_batch() - input_r = tokenizer_r.batch_encode_plus( - words, - boxes=boxes, - max_length=max_length, - pad_to_max_length=True, - ) - input_p = tokenizer_p.batch_encode_plus( - words, - boxes=boxes, - max_length=max_length, - pad_to_max_length=True, - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.batch_encode_plus( words, boxes=boxes, diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index 3e49516cd63..3cdbd4acf83 100644 --- a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -382,41 +382,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) ) - def test_padding_to_max_length(self): - """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - nodes, xpaths = self.get_nodes_and_xpaths() - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, nodes) - - padding_idx = tokenizer.pad_token_id - - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(nodes, xpaths=xpaths) - sequence_length = len(encoded_sequence) - # FIXME: the next line should be padding(max_length) to avoid warning - padded_sequence = tokenizer.encode( - nodes, xpaths=xpaths, max_length=sequence_length + padding_size, pad_to_max_length=True - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - - # Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode(nodes, xpaths=xpaths) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(nodes, xpaths=xpaths, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - def test_padding(self, max_length=50): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -428,9 +393,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode - Simple input nodes, xpaths = self.get_nodes_and_xpaths() - input_r = tokenizer_r.encode(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode(nodes, xpaths=xpaths, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode(nodes, xpaths=xpaths, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) @@ -441,13 +403,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode - Pair input question, nodes, xpaths = self.get_question_nodes_and_xpaths() - input_r = tokenizer_r.encode( - question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode( - question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode( question, nodes, xpaths=xpaths, max_length=max_length, padding="max_length" ) @@ -461,10 +416,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode_plus - Simple input nodes, xpaths = self.get_nodes_and_xpaths() - input_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus(nodes, xpaths=xpaths, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode_plus(nodes, xpaths=xpaths, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) @@ -480,14 +431,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode_plus - Pair input question, nodes, xpaths = self.get_question_nodes_and_xpaths() - input_r = tokenizer_r.encode_plus( - question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - question, nodes, xpaths=xpaths, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus( question, nodes, xpaths=xpaths, max_length=max_length, padding="max_length" ) @@ -506,20 +449,6 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Batch_encode_plus - Simple input nodes, xpaths = self.get_nodes_and_xpaths_batch() - input_r = tokenizer_r.batch_encode_plus( - nodes, - xpaths=xpaths, - max_length=max_length, - pad_to_max_length=True, - ) - input_p = tokenizer_p.batch_encode_plus( - nodes, - xpaths=xpaths, - max_length=max_length, - pad_to_max_length=True, - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.batch_encode_plus( nodes, xpaths=xpaths, diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index 103e8428d60..25ae9528111 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -657,42 +657,6 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) ) - def test_padding_to_max_length(self): - """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - table = self.get_table(tokenizer) - sequence = "Sequence" - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequence) - - padding_idx = tokenizer.pad_token_id - - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - # FIXME: the next line should be padding(max_length) to avoid warning - padded_sequence = tokenizer.encode( - table, sequence, max_length=sequence_length + padding_size, padding=True - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - - # Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode(table, sequence) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(table, sequence, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - def test_call(self): # Tests that all call wrap to encode_plus and batch_encode_plus tokenizers = self.get_tokenizers(do_lower_case=False) diff --git a/tests/models/udop/test_tokenization_udop.py b/tests/models/udop/test_tokenization_udop.py index 73c36944d3d..581cee32e21 100644 --- a/tests/models/udop/test_tokenization_udop.py +++ b/tests/models/udop/test_tokenization_udop.py @@ -417,41 +417,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences) ) - def test_padding_to_max_length(self): - """We keep this test for backward compatibility but it should be removed when `pad_to_max_length` will be deprecated""" - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - words, boxes = self.get_words_and_boxes() - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, words) - - padding_idx = tokenizer.pad_token_id - - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes) - sequence_length = len(encoded_sequence) - # FIXME: the next line should be padding(max_length) to avoid warning - padded_sequence = tokenizer.encode_boxes( - words, boxes=boxes, max_length=sequence_length + padding_size, pad_to_max_length=True - ) - padded_sequence_length = len(padded_sequence) - assert sequence_length + padding_size == padded_sequence_length - assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - - # Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode_boxes(words, boxes=boxes) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode_boxes(words, boxes=boxes, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) - assert sequence_length == padded_sequence_right_length - assert encoded_sequence == padded_sequence_right - def test_padding(self, max_length=50): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): @@ -463,9 +428,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode - Simple input words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode_boxes(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode_boxes(words, boxes=boxes, max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode_boxes(words, boxes=boxes, max_length=max_length, padding="max_length") input_p = tokenizer_p.encode_boxes(words, boxes=boxes, max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) @@ -476,13 +438,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode - Pair input question, words, boxes = self.get_question_words_and_boxes() - input_r = tokenizer_r.encode_boxes( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_boxes( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode_boxes( question, words, boxes=boxes, max_length=max_length, padding="max_length" ) @@ -496,14 +451,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode_plus - Simple input words, boxes = self.get_words_and_boxes() - input_r = tokenizer_r.encode_plus_boxes( - words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus_boxes( - words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus_boxes( words, boxes=boxes, max_length=max_length, padding="max_length" ) @@ -523,14 +470,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Encode_plus - Pair input question, words, boxes = self.get_question_words_and_boxes() - input_r = tokenizer_r.encode_plus_boxes( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus_boxes( - question, words, boxes=boxes, max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus_boxes( question, words, boxes=boxes, max_length=max_length, padding="max_length" ) @@ -549,20 +488,6 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Batch_encode_plus - Simple input words, boxes = self.get_words_and_boxes_batch() - input_r = tokenizer_r.batch_encode_plus_boxes( - words, - boxes=boxes, - max_length=max_length, - pad_to_max_length=True, - ) - input_p = tokenizer_p.batch_encode_plus_boxes( - words, - boxes=boxes, - max_length=max_length, - pad_to_max_length=True, - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.batch_encode_plus_boxes( words, boxes=boxes, diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 356473d11ad..b1749f281e6 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -2475,41 +2475,6 @@ class TokenizerTesterMixin: self.assertEqual(sequence_length, truncated_sequence_left_length) self.assertEqual(encoded_sequence, truncated_sequence_left) - def test_padding_to_max_length(self): - """We keep this test for backward compatibility but it should be remove when `pad_to_max_length` is deprecated.""" - tokenizers = self.get_tokenizers(do_lower_case=False) - for tokenizer in tokenizers: - with self.subTest(f"{tokenizer.__class__.__name__}"): - sequence = "Sequence" - padding_size = 10 - - # check correct behaviour if no pad_token_id exists and add it eventually - self._check_no_pad_token_padding(tokenizer, sequence) - - padding_idx = tokenizer.pad_token_id - - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True - tokenizer.padding_side = "right" - encoded_sequence = tokenizer.encode(sequence) - sequence_length = len(encoded_sequence) - # FIXME: the next line should be padding(max_length) to avoid warning - padded_sequence = tokenizer.encode( - sequence, max_length=sequence_length + padding_size, pad_to_max_length=True - ) - padded_sequence_length = len(padded_sequence) - self.assertEqual(sequence_length + padding_size, padded_sequence_length) - self.assertEqual(encoded_sequence + [padding_idx] * padding_size, padded_sequence) - - # Check that nothing is done when a maximum length is not specified - encoded_sequence = tokenizer.encode(sequence) - sequence_length = len(encoded_sequence) - - tokenizer.padding_side = "right" - padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True) - padded_sequence_right_length = len(padded_sequence_right) - self.assertEqual(sequence_length, padded_sequence_right_length) - self.assertEqual(encoded_sequence, padded_sequence_right) - def test_padding_to_multiple_of(self): tokenizers = self.get_tokenizers() for tokenizer in tokenizers: @@ -3900,9 +3865,6 @@ class TokenizerTesterMixin: pad_token_id = tokenizer_p.pad_token_id # Encode - Simple input - input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length") input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length") self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) @@ -3912,13 +3874,6 @@ class TokenizerTesterMixin: self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) # Encode - Pair input - input_r = tokenizer_r.encode( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id) input_r = tokenizer_r.encode( "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" ) @@ -3931,14 +3886,6 @@ class TokenizerTesterMixin: self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id) # Encode_plus - Simple input - input_r = tokenizer_r.encode_plus( - "This is a simple input", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus( "This is a simple input", max_length=max_length, padding="max_length" ) @@ -3957,14 +3904,6 @@ class TokenizerTesterMixin: self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) # Encode_plus - Pair input - input_r = tokenizer_r.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) input_r = tokenizer_r.encode_plus( "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" ) @@ -3981,18 +3920,6 @@ class TokenizerTesterMixin: self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) # Batch_encode_plus - Simple input - input_r = tokenizer_r.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - pad_to_max_length=True, - ) - input_p = tokenizer_p.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - pad_to_max_length=True, - ) - self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id) - input_r = tokenizer_r.batch_encode_plus( ["This is a simple input 1", "This is a simple input 2"], max_length=max_length,