diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py
index e55f3f36ad6..80d542367dc 100644
--- a/src/transformers/models/bert/tokenization_bert_fast.py
+++ b/src/transformers/models/bert/tokenization_bert_fast.py
@@ -265,7 +265,7 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/convbert/tokenization_convbert_fast.py b/src/transformers/models/convbert/tokenization_convbert_fast.py
index 07447bb6a17..7ccc21b3e05 100644
--- a/src/transformers/models/convbert/tokenization_convbert_fast.py
+++ b/src/transformers/models/convbert/tokenization_convbert_fast.py
@@ -159,7 +159,7 @@ class ConvBertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py b/src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
index 30cb69c2b32..07f7964b9f3 100644
--- a/src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
+++ b/src/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
@@ -164,7 +164,7 @@ class RetriBertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/distilbert/tokenization_distilbert_fast.py b/src/transformers/models/distilbert/tokenization_distilbert_fast.py
index dd9dcd165d4..adb90f857d7 100644
--- a/src/transformers/models/distilbert/tokenization_distilbert_fast.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert_fast.py
@@ -190,7 +190,7 @@ class DistilBertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/electra/tokenization_electra_fast.py b/src/transformers/models/electra/tokenization_electra_fast.py
index cf92dd01714..81704317f86 100644
--- a/src/transformers/models/electra/tokenization_electra_fast.py
+++ b/src/transformers/models/electra/tokenization_electra_fast.py
@@ -192,7 +192,7 @@ class ElectraTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/funnel/tokenization_funnel_fast.py b/src/transformers/models/funnel/tokenization_funnel_fast.py
index 864303eb210..17946eb74b5 100644
--- a/src/transformers/models/funnel/tokenization_funnel_fast.py
+++ b/src/transformers/models/funnel/tokenization_funnel_fast.py
@@ -212,7 +212,7 @@ class FunnelTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
index 7ba06d7fa11..afa92abaf87 100644
--- a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
+++ b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
@@ -166,7 +166,7 @@ class LayoutLMTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/lxmert/tokenization_lxmert_fast.py b/src/transformers/models/lxmert/tokenization_lxmert_fast.py
index 8e58a3aafac..0584f1fe83c 100644
--- a/src/transformers/models/lxmert/tokenization_lxmert_fast.py
+++ b/src/transformers/models/lxmert/tokenization_lxmert_fast.py
@@ -152,7 +152,7 @@ class LxmertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
index 6bac366d237..f8d62158b22 100644
--- a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
+++ b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
@@ -150,7 +150,7 @@ class MobileBertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/realm/tokenization_realm_fast.py
index 1cc1a996653..59b23f45ee0 100644
--- a/src/transformers/models/realm/tokenization_realm_fast.py
+++ b/src/transformers/models/realm/tokenization_realm_fast.py
@@ -282,7 +282,7 @@ class RealmTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py
index d73e3cdb93c..360b76b843d 100644
--- a/src/transformers/models/roformer/tokenization_roformer_fast.py
+++ b/src/transformers/models/roformer/tokenization_roformer_fast.py
@@ -163,7 +163,7 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
index bf7659ffd18..23faab71349 100644
--- a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
+++ b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
@@ -173,7 +173,7 @@ class SqueezeBertTokenizerFast(PreTrainedTokenizerFast):
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        if token_ids_1:
+        if token_ids_1 is not None:
             output += token_ids_1 + [self.sep_token_id]
 
         return output
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index a2f207c9639..523d49bc9d3 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -3209,19 +3209,27 @@ class TokenizerTesterMixin:
                 # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                 # self.assertEqual(output_p, output_r)
 
-                # Input tokens id
-                input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False)
-                input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False)
+                input_pairs = [
+                    ("", ""),
+                    ("", "This is a sample pair"),
+                    ("This is a sample input", ""),
+                    ("This is a sample input", "This is a sample pair"),
+                ]
 
-                # Generate output
-                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
-                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
-                self.assertEqual(output_p, output_r)
+                for sample_input, sample_pair in input_pairs:
+                    # Input tokens id
+                    input_simple = tokenizer_p.encode(sample_input, add_special_tokens=False)
+                    input_pair = tokenizer_p.encode(sample_pair, add_special_tokens=False)
 
-                # Generate pair output
-                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
-                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
-                self.assertEqual(output_p, output_r)
+                    # Generate output
+                    output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
+                    output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
+                    self.assertEqual(output_p, output_r)
+
+                    # Generate pair output
+                    output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
+                    output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
+                    self.assertEqual(output_p, output_r)
 
     def test_padding(self, max_length=50):
         if not self.test_slow_tokenizer:
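
A note on why the one-line change matters: in Python an empty list is falsy, so `if token_ids_1:` treats an explicitly passed empty pair sequence the same as `token_ids_1=None` and silently drops the trailing `[SEP]`. The slow tokenizers already check `token_ids_1 is not None`, so fast and slow outputs diverged exactly for empty pairs, which is what the new test cases above exercise. A minimal, self-contained sketch of the before/after behaviour (the `CLS`/`SEP` ids below are placeholders, not taken from any real vocabulary):

```python
# Placeholder special-token ids for illustration only (not a real vocab).
CLS, SEP = 101, 102


def build_inputs_old(token_ids_0, token_ids_1=None):
    # Old check: `[]` is falsy, so an empty second sequence loses its [SEP].
    output = [CLS] + token_ids_0 + [SEP]
    if token_ids_1:
        output += token_ids_1 + [SEP]
    return output


def build_inputs_new(token_ids_0, token_ids_1=None):
    # Fixed check: skip the second segment only when none was passed at all.
    output = [CLS] + token_ids_0 + [SEP]
    if token_ids_1 is not None:
        output += token_ids_1 + [SEP]
    return output


# An explicitly supplied empty pair now keeps its closing [SEP] ...
assert build_inputs_old([7], []) == [CLS, 7, SEP]       # trailing SEP dropped
assert build_inputs_new([7], []) == [CLS, 7, SEP, SEP]  # trailing SEP kept
# ... while the single-sequence case is unchanged.
assert build_inputs_new([7]) == build_inputs_old([7]) == [CLS, 7, SEP]
```

This also explains the shape of the test matrix: `encode("")` with `add_special_tokens=False` yields `[]`, so the `("", "")` and `("This is a sample input", "")` cases are precisely the inputs where the old truthiness check disagreed with the slow tokenizers.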