Add BlenderbotTokenizerFast (#13720)

* Add support for the fast (Rust) implementation of BlenderbotTokenizer

* Fix a converter and a typo in a doc

* Apply patil-suraj's suggestion

* (Nitpick) Fast tokenization -> Fast Tokenization in doc

* Apply SaulLu's suggestion

* Apply Narsil's suggestion to fix test pipelines

* Add encoder_no_repeat_ngram_size according to Narsil's suggestion

* Revert the last (unnecessary) commit

* Override pipeline config for Blenderbot to allow for larger pos. emb.

* make fix-copies
Daniel Stancl 2021-10-29 15:19:01 +02:00 committed by GitHub
parent 5b45422b58
commit d37f1fb8ba
12 changed files with 181 additions and 6 deletions
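
The heart of the change is a drop-in fast tokenizer. A minimal usage sketch (assuming the facebook/blenderbot-3B checkpoint and an environment with the `tokenizers` backend installed; the behaviour shown mirrors the round-trip tests added at the bottom of this commit):

from transformers import BlenderbotTokenizerFast

tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")

# Like the slow tokenizer, the fast one appends only the EOS token (</s>)
# and never prepends a BOS token.
ids = tokenizer(" I am a small frog.").input_ids
print(ids[-1] == tokenizer.eos_token_id)  # True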


@@ -379,7 +379,7 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ |
| Blenderbot | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+


@@ -81,6 +81,13 @@ BlenderbotTokenizer
    :members: build_inputs_with_special_tokens


BlenderbotTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.BlenderbotTokenizerFast
    :members: build_inputs_with_special_tokens


BlenderbotModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


@@ -398,6 +398,7 @@ if is_tokenizers_available():
    _import_structure["models.barthez"].append("BarthezTokenizerFast")
    _import_structure["models.bert"].append("BertTokenizerFast")
    _import_structure["models.big_bird"].append("BigBirdTokenizerFast")
    _import_structure["models.blenderbot"].append("BlenderbotTokenizerFast")
    _import_structure["models.camembert"].append("CamembertTokenizerFast")
    _import_structure["models.deberta"].append("DebertaTokenizerFast")
    _import_structure["models.distilbert"].append("DistilBertTokenizerFast")
@@ -2285,6 +2286,7 @@ if TYPE_CHECKING:
        from .models.barthez import BarthezTokenizerFast
        from .models.bert import BertTokenizerFast
        from .models.big_bird import BigBirdTokenizerFast
        from .models.blenderbot import BlenderbotTokenizerFast
        from .models.blenderbot_small import BlenderbotSmallTokenizerFast
        from .models.camembert import CamembertTokenizerFast
        from .models.clip import CLIPTokenizerFast


@@ -893,12 +893,42 @@ class LayoutLMv2Converter(Converter):
        return tokenizer


class BlenderbotConverter(Converter):
    def converted(self) -> Tokenizer:
        ot = self.original_tokenizer
        vocab = ot.encoder
        merges = list(ot.bpe_ranks.keys())

        tokenizer = Tokenizer(
            BPE(
                vocab=vocab,
                merges=merges,
                dropout=None,
                continuing_subword_prefix="",
                end_of_word_suffix="",
                fuse_unk=False,
            )
        )

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"$A:0 {ot.eos_token}:0",
            special_tokens=[
                (ot.eos_token, ot.eos_token_id),
            ],
        )

        return tokenizer


SLOW_TO_FAST_CONVERTERS = {
    "AlbertTokenizer": AlbertConverter,
    "BartTokenizer": RobertaConverter,
    "BarthezTokenizer": BarthezConverter,
    "BertTokenizer": BertConverter,
    "BigBirdTokenizer": BigBirdConverter,
    "BlenderbotTokenizer": BlenderbotConverter,
    "CamembertTokenizer": CamembertConverter,
    "CLIPTokenizer": CLIPConverter,
    "ConvBertTokenizer": BertConverter,

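For context, a sketch of how the new converter gets exercised: the generic slow-to-fast entry point looks the class name up in SLOW_TO_FAST_CONVERTERS and runs the matching converter. It assumes the existing convert_slow_tokenizer helper and the facebook/blenderbot-3B checkpoint; treat it as illustrative rather than part of this diff:

from transformers import BlenderbotTokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

slow = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
# Resolves "BlenderbotTokenizer" -> BlenderbotConverter and builds a byte-level
# BPE `tokenizers.Tokenizer` whose post-processor appends </s> to every sequence.
backend = convert_slow_tokenizer(slow)
print(type(backend).__name__)  # Tokenizer

This is roughly the path BlenderbotTokenizerFast.from_pretrained follows when a checkpoint ships only slow-tokenizer files.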

@@ -108,7 +108,7 @@ else:
            ),
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
            ("blenderbot", ("BlenderbotTokenizer", None)),
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),

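The effect of this mapping change, sketched with the 3B checkpoint (illustrative, assuming the `tokenizers` backend is installed; use_fast defaults to True):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/blenderbot-3B")
print(type(tok).__name__, tok.is_fast)  # "BlenderbotTokenizerFast", True

Without the `tokenizers` backend, AutoTokenizer falls back to the slow BlenderbotTokenizer as before.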

@@ -18,7 +18,7 @@
from typing import TYPE_CHECKING

from ...file_utils import _LazyModule, is_tf_available, is_torch_available
from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available


_import_structure = {
@@ -26,6 +26,9 @@ _import_structure = {
    "tokenization_blenderbot": ["BlenderbotTokenizer"],
}

if is_tokenizers_available():
    _import_structure["tokenization_blenderbot_fast"] = ["BlenderbotTokenizerFast"]

if is_torch_available():
    _import_structure["modeling_blenderbot"] = [
        "BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -48,6 +51,9 @@ if TYPE_CHECKING:
    from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig
    from .tokenization_blenderbot import BlenderbotTokenizer

    if is_tokenizers_available():
        from .tokenization_blenderbot_fast import BlenderbotTokenizerFast

    if is_torch_available():
        from .modeling_blenderbot import (
            BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST,


@@ -14,7 +14,7 @@
# limitations under the License.
"""Tokenization class for Blenderbot."""

from typing import TYPE_CHECKING, List
from typing import TYPE_CHECKING, List, Optional

from ...utils import logging
from ..roberta.tokenization_roberta import RobertaTokenizer
@@ -58,7 +58,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None):
    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A Blenderbot sequence has the following format:


@@ -0,0 +1,96 @@
# coding=utf-8
# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization class for Blenderbot."""

from typing import TYPE_CHECKING, List, Optional

from ...utils import logging
from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast
from .tokenization_blenderbot import BlenderbotTokenizer


if TYPE_CHECKING:
    from transformers.pipelines.conversational import Conversation

logger = logging.get_logger(__name__)


VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_config_file": "tokenizer_config.json",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
    "merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
    "tokenizer_config_file": {
        "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}


class BlenderbotTokenizerFast(RobertaTokenizerFast):
    r"""
    Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's `tokenizers` library).

    :class:`~transformers.BlenderbotTokenizerFast` is nearly identical to :class:`~transformers.RobertaTokenizerFast`
    and runs end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add
    a BOS token to the beginning of sequences.

    Refer to superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning
    parameters.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = BlenderbotTokenizer

    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A Blenderbot sequence has the following format:

        - single sequence: `` X </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (:obj:`List[int]`, `optional`):
                Will be ignored

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        return token_ids_0 + [self.eos_token_id]

    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
        inputs = []
        for is_user, text in conversation.iter_texts():
            if is_user:
                # We need to prefix user turns with a space, as is done inside Blenderbot itself.
                inputs.append(" " + text)
            else:
                # Generated responses already contain the leading space.
                inputs.append(text)

        full_string = " ".join(inputs)
        input_ids = self.encode(full_string)
        if len(input_ids) > self.model_max_length:
            input_ids = input_ids[-self.model_max_length :]
            logger.warning(f"Trimmed input from conversation as it was longer than {self.model_max_length} tokens.")
        return input_ids
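
A short sketch of the two overridden methods above; the ids 5502 and 2 are taken from the ParlAI-parity tests added further down, not independently verified:

from transformers import BlenderbotTokenizerFast, Conversation

tok = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")

# build_inputs_with_special_tokens: single sequence "X </s>"; only EOS is
# appended and a second sequence is ignored.
print(tok.build_inputs_with_special_tokens([5502]))  # [5502, 2]

# _build_conversation_input_ids: user turns get a leading space, the turns are
# joined and encoded, and over-long histories are trimmed from the left.
conv = Conversation("I am a small frog.")
print(tok._build_conversation_input_ids(conv))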


@@ -47,6 +47,15 @@ class BigBirdTokenizerFast:
        requires_backends(cls, ["tokenizers"])


class BlenderbotTokenizerFast:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tokenizers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["tokenizers"])


class BlenderbotSmallTokenizerFast:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tokenizers"])
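
These dummies keep `from transformers import BlenderbotTokenizerFast` importable even without the optional backend; a hedged sketch of the intended behaviour when `tokenizers` is missing (the exact error message comes from requires_backends and is not reproduced here):

from transformers import BlenderbotTokenizerFast

try:
    BlenderbotTokenizerFast()  # without the `tokenizers` backend, the dummy raises immediately
except ImportError as err:
    print("tokenizers backend required:", err)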


@@ -137,6 +137,11 @@ class BlenderbotModelTester:
            pad_token_id=self.pad_token_id,
        )

    def get_pipeline_config(self):
        config = self.get_config()
        config.max_position_embeddings = 100
        return config

    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
        return config, inputs_dict


@@ -124,6 +124,11 @@ class PipelineTestCaseMeta(type):
                def test(self):
                    if ModelClass.__name__.endswith("ForCausalLM"):
                        tiny_config.is_encoder_decoder = False
                        if hasattr(tiny_config, "encoder_no_repeat_ngram_size"):
                            # specific to Blenderbot, which supports both decoder-only and
                            # encoder-decoder architectures, while the test config only
                            # reflects the encoder-decoder arch
                            tiny_config.encoder_no_repeat_ngram_size = 0
                    if ModelClass.__name__.endswith("WithLMHead"):
                        tiny_config.is_decoder = True
                    try:
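
Background for the guard, as a hedged sketch: encoder_no_repeat_ngram_size bans n-grams that occur in the encoder input from being generated, which only makes sense for an encoder-decoder model; Blenderbot configs ship a non-zero value, so the decoder-only (ForCausalLM) path has to disable it. The explicit value below is an illustrative assumption, not taken from this diff:

from transformers import BlenderbotConfig

tiny_config = BlenderbotConfig(encoder_no_repeat_ngram_size=3)  # assumed non-zero, as in released Blenderbot configs
if hasattr(tiny_config, "encoder_no_repeat_ngram_size"):
    tiny_config.encoder_no_repeat_ngram_size = 0  # same override as in the test meta-class above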


@@ -16,8 +16,8 @@
"""Tests for Blenderbot Tokenizers, including common tests for BlenderbotSmallTokenizer."""
import unittest

from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast
from transformers.file_utils import cached_property
from transformers.models.blenderbot.tokenization_blenderbot import BlenderbotTokenizer


class Blenderbot3BTokenizerTests(unittest.TestCase):
@@ -25,6 +25,10 @@ class Blenderbot3BTokenizerTests(unittest.TestCase):
    def tokenizer_3b(self):
        return BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")

    @cached_property
    def rust_tokenizer_3b(self):
        return BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")

    def test_encode_decode_cycle(self):
        tok = self.tokenizer_3b
        src_text = " I am a small frog."
@@ -32,6 +36,17 @@ class Blenderbot3BTokenizerTests(unittest.TestCase):
        decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        assert src_text == decoded

    def test_encode_decode_cycle_rust_tokenizer(self):
        tok = self.rust_tokenizer_3b
        src_text = " I am a small frog."
        encoded = tok([src_text], padding=False, truncation=False)["input_ids"]
        decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        assert src_text == decoded

    def test_3B_tokenization_same_as_parlai(self):
        assert self.tokenizer_3b.add_prefix_space
        assert self.tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]

    def test_3B_tokenization_same_as_parlai_rust_tokenizer(self):
        assert self.rust_tokenizer_3b.add_prefix_space
        assert self.rust_tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]