diff --git a/.circleci/config.yml b/.circleci/config.yml index 7f9c4947966..014566bbccb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -198,7 +198,7 @@ jobs: - v0.3-build_doc-{{ checksum "setup.py" }} - v0.3-{{ checksum "setup.py" }} - run: pip install --upgrade pip - - run: pip install .[tf,torch,docs] + - run: pip install .[tf,torch,sentencepiece,docs] - save_cache: key: v0.3-build_doc-{{ checksum "setup.py" }} paths: @@ -219,7 +219,7 @@ jobs: keys: - v0.3-deploy_doc-{{ checksum "setup.py" }} - v0.3-{{ checksum "setup.py" }} - - run: pip install .[tf,torch,docs] + - run: pip install .[tf,torch,sentencepiece,docs] - save_cache: key: v0.3-deploy_doc-{{ checksum "setup.py" }} paths: diff --git a/.github/workflows/github-torch-hub.yml b/.github/workflows/github-torch-hub.yml index de419b58fb1..9fa1a58b909 100644 --- a/.github/workflows/github-torch-hub.yml +++ b/.github/workflows/github-torch-hub.yml @@ -30,8 +30,7 @@ jobs: run: | pip install --upgrade pip pip install torch - pip install numpy filelock protobuf requests tqdm regex sentencepiece sacremoses packaging - pip install tokenizers==0.9.0.rc2 + pip install numpy filelock protobuf requests tqdm regex sentencepiece sacremoses tokenizers packaging - name: Torch hub list run: | diff --git a/.gitignore b/.gitignore index ed6a27d2c00..cb6316a6989 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,8 @@ __pycache__/ *.so # tests and logs -tests/fixtures +tests/fixtures/* +!tests/fixtures/sample_text_no_unicode.txt logs/ lightning_logs/ lang_code_data/ diff --git a/docs/source/task_summary.rst b/docs/source/task_summary.rst index eaef0480e09..60cae76ab0b 100644 --- a/docs/source/task_summary.rst +++ b/docs/source/task_summary.rst @@ -758,8 +758,8 @@ Here is an example of using the pipelines to do summarization. It leverages a Ba ... If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18. ... """ -Because the summarization pipeline depends on the ``PretrainedModel.generate()`` method, we can override the default arguments -of ``PretrainedModel.generate()`` directly in the pipeline for ``max_length`` and ``min_length`` as shown below. +Because the summarization pipeline depends on the ``PreTrainedModel.generate()`` method, we can override the default arguments +of ``PreTrainedModel.generate()`` directly in the pipeline for ``max_length`` and ``min_length`` as shown below. This outputs the following summary: .. code-block:: @@ -772,7 +772,7 @@ Here is an example of doing summarization using a model and a tokenizer. The pro 1. Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``. 2. Define the article that should be summarized. 3. Add the T5 specific prefix "summarize: ". -4. Use the ``PretrainedModel.generate()`` method to generate the summary. +4. Use the ``PreTrainedModel.generate()`` method to generate the summary. In this example we use Google`s T5 model. Even though it was pre-trained only on a multi-task mixed dataset (including CNN / Daily Mail), it yields very good results. @@ -819,15 +819,15 @@ translation results. 
>>> print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40)) [{'translation_text': 'Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.'}] -Because the translation pipeline depends on the ``PretrainedModel.generate()`` method, we can override the default arguments -of ``PretrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` above. +Because the translation pipeline depends on the ``PreTrainedModel.generate()`` method, we can override the default arguments +of ``PreTrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` above. Here is an example of doing translation using a model and a tokenizer. The process is the following: 1. Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``. 2. Define the article that should be summarizaed. 3. Add the T5 specific prefix "translate English to German: " -4. Use the ``PretrainedModel.generate()`` method to perform the translation. +4. Use the ``PreTrainedModel.generate()`` method to perform the translation. .. code-block:: diff --git a/examples/requirements.txt b/examples/requirements.txt index 41bb6c852aa..120a3ab5e06 100644 --- a/examples/requirements.txt +++ b/examples/requirements.txt @@ -17,3 +17,4 @@ datasets fire pytest conllu +sentencepiece != 0.1.92 diff --git a/setup.py b/setup.py index 4be03b285b4..5a6ef149d9a 100644 --- a/setup.py +++ b/setup.py @@ -92,12 +92,13 @@ extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"] extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] extras["all"] = extras["serving"] + ["tensorflow", "torch"] +extras["sentencepiece"] = ["sentencepiece!=0.1.92"] extras["retrieval"] = ["faiss-cpu", "datasets"] extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] # sphinx-rtd-theme==0.5.0 introduced big changes in the style. extras["docs"] = ["recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", "sphinx-copybutton"] extras["quality"] = ["black >= 20.8b1", "isort >= 5.5.4", "flake8 >= 3.8.3"] -extras["dev"] = extras["testing"] + extras["quality"] + extras["ja"] + ["scikit-learn", "tensorflow", "torch"] +extras["dev"] = extras["testing"] + extras["quality"] + extras["ja"] + ["scikit-learn", "tensorflow", "torch", "sentencepiece!=0.1.92"] setup( name="transformers", @@ -114,7 +115,7 @@ setup( packages=find_packages("src"), install_requires=[ "numpy", - "tokenizers == 0.9.0.rc2", + "tokenizers == 0.9.2", # dataclasses for Python versions that don't have it "dataclasses;python_version<'3.7'", # utilities from PyPA to e.g. 
compare versions diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 358901967a5..af91c2e656a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -92,6 +92,7 @@ from .file_utils import ( MODEL_CARD_NAME, PYTORCH_PRETRAINED_BERT_CACHE, PYTORCH_TRANSFORMERS_CACHE, + SPIECE_UNDERLINE, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, TRANSFORMERS_CACHE, @@ -104,8 +105,10 @@ from .file_utils import ( is_faiss_available, is_psutil_available, is_py3nvml_available, + is_sentencepiece_available, is_sklearn_available, is_tf_available, + is_tokenizers_available, is_torch_available, is_torch_tpu_available, ) @@ -152,49 +155,41 @@ from .pipelines import ( from .retrieval_rag import RagRetriever # Tokenizers -from .tokenization_albert import AlbertTokenizer, AlbertTokenizerFast from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer -from .tokenization_bart import BartTokenizer, BartTokenizerFast -from .tokenization_bert import BasicTokenizer, BertTokenizer, BertTokenizerFast, WordpieceTokenizer -from .tokenization_bert_generation import BertGenerationTokenizer +from .tokenization_bart import BartTokenizer +from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer from .tokenization_bertweet import BertweetTokenizer from .tokenization_blenderbot import BlenderbotSmallTokenizer, BlenderbotTokenizer -from .tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast from .tokenization_ctrl import CTRLTokenizer from .tokenization_deberta import DebertaTokenizer -from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast +from .tokenization_distilbert import DistilBertTokenizer from .tokenization_dpr import ( DPRContextEncoderTokenizer, - DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizer, - DPRQuestionEncoderTokenizerFast, + DPRReaderOutput, DPRReaderTokenizer, - DPRReaderTokenizerFast, ) -from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast +from .tokenization_electra import ElectraTokenizer from .tokenization_flaubert import FlaubertTokenizer from .tokenization_fsmt import FSMTTokenizer -from .tokenization_funnel import FunnelTokenizer, FunnelTokenizerFast -from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast -from .tokenization_herbert import HerbertTokenizer, HerbertTokenizerFast -from .tokenization_layoutlm import LayoutLMTokenizer, LayoutLMTokenizerFast -from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast -from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast -from .tokenization_mbart import MBartTokenizer, MBartTokenizerFast -from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast -from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast -from .tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast +from .tokenization_funnel import FunnelTokenizer +from .tokenization_gpt2 import GPT2Tokenizer +from .tokenization_herbert import HerbertTokenizer +from .tokenization_layoutlm import LayoutLMTokenizer +from .tokenization_longformer import LongformerTokenizer +from .tokenization_lxmert import LxmertTokenizer +from .tokenization_mobilebert import MobileBertTokenizer +from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_phobert import PhobertTokenizer from .tokenization_rag import RagTokenizer -from .tokenization_reformer import ReformerTokenizer, 
ReformerTokenizerFast -from .tokenization_retribert import RetriBertTokenizer, RetriBertTokenizerFast -from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast -from .tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast -from .tokenization_t5 import T5Tokenizer, T5TokenizerFast +from .tokenization_retribert import RetriBertTokenizer +from .tokenization_roberta import RobertaTokenizer +from .tokenization_squeezebert import SqueezeBertTokenizer from .tokenization_transfo_xl import TransfoXLCorpus, TransfoXLTokenizer from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils_base import ( + AddedToken, BatchEncoding, CharSpan, PreTrainedTokenizerBase, @@ -202,10 +197,59 @@ from .tokenization_utils_base import ( TensorType, TokenSpan, ) -from .tokenization_utils_fast import PreTrainedTokenizerFast from .tokenization_xlm import XLMTokenizer -from .tokenization_xlm_roberta import XLMRobertaTokenizer, XLMRobertaTokenizerFast -from .tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast + + +if is_sentencepiece_available(): + from .tokenization_albert import AlbertTokenizer + from .tokenization_bert_generation import BertGenerationTokenizer + from .tokenization_camembert import CamembertTokenizer + from .tokenization_marian import MarianTokenizer + from .tokenization_mbart import MBartTokenizer + from .tokenization_pegasus import PegasusTokenizer + from .tokenization_reformer import ReformerTokenizer + from .tokenization_t5 import T5Tokenizer + from .tokenization_xlm_roberta import XLMRobertaTokenizer + from .tokenization_xlnet import XLNetTokenizer +else: + from .utils.dummy_sentencepiece_objects import * + +if is_tokenizers_available(): + from .tokenization_albert_fast import AlbertTokenizerFast + from .tokenization_bart_fast import BartTokenizerFast + from .tokenization_bert_fast import BertTokenizerFast + from .tokenization_camembert_fast import CamembertTokenizerFast + from .tokenization_distilbert_fast import DistilBertTokenizerFast + from .tokenization_dpr_fast import ( + DPRContextEncoderTokenizerFast, + DPRQuestionEncoderTokenizerFast, + DPRReaderTokenizerFast, + ) + from .tokenization_electra_fast import ElectraTokenizerFast + from .tokenization_funnel_fast import FunnelTokenizerFast + from .tokenization_gpt2_fast import GPT2TokenizerFast + from .tokenization_herbert_fast import HerbertTokenizerFast + from .tokenization_layoutlm_fast import LayoutLMTokenizerFast + from .tokenization_longformer_fast import LongformerTokenizerFast + from .tokenization_lxmert_fast import LxmertTokenizerFast + from .tokenization_mbart_fast import MBartTokenizerFast + from .tokenization_mobilebert_fast import MobileBertTokenizerFast + from .tokenization_openai_fast import OpenAIGPTTokenizerFast + from .tokenization_pegasus_fast import PegasusTokenizerFast + from .tokenization_reformer_fast import ReformerTokenizerFast + from .tokenization_retribert_fast import RetriBertTokenizerFast + from .tokenization_roberta_fast import RobertaTokenizerFast + from .tokenization_squeezebert_fast import SqueezeBertTokenizerFast + from .tokenization_t5_fast import T5TokenizerFast + from .tokenization_utils_fast import PreTrainedTokenizerFast + from .tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast + from .tokenization_xlnet_fast import XLNetTokenizerFast + + if is_sentencepiece_available(): + from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, convert_slow_tokenizer +else: + from .utils.dummy_tokenizers_objects import * + 
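(For readers following the gated imports above: when a backend is missing, the star-imports pull placeholder classes from `.utils.dummy_sentencepiece_objects` / `.utils.dummy_tokenizers_objects`. Below is a minimal sketch of that placeholder pattern, assuming the `requires_sentencepiece` helper added to `file_utils.py` later in this diff; the exact contents of the generated dummy modules may differ.)

# Illustrative sketch of a dummy placeholder module (not the verbatim file):
from .file_utils import requires_sentencepiece


class AlbertTokenizer:
    # Stub exposed when SentencePiece is not installed; any use raises an
    # ImportError pointing at the SentencePiece installation instructions.
    def __init__(self, *args, **kwargs):
        requires_sentencepiece(self)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_sentencepiece(cls)
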
# Trainer from .trainer_callback import ( @@ -539,7 +583,6 @@ if is_torch_available(): get_linear_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup, ) - from .tokenization_marian import MarianTokenizer # Trainer from .trainer import Trainer diff --git a/src/transformers/configuration_auto.py b/src/transformers/configuration_auto.py index 731219babc7..d0aecaf739e 100644 --- a/src/transformers/configuration_auto.py +++ b/src/transformers/configuration_auto.py @@ -266,7 +266,7 @@ class AutoConfig: our S3, e.g., ``dbmdz/bert-base-german-cased``. - A path to a `directory` containing a configuration file saved using the :meth:`~transformers.PretrainedConfig.save_pretrained` method, or the - :meth:`~transformers.PretrainedModel.save_pretrained` method, e.g., ``./my_model_directory/``. + :meth:`~transformers.PreTrainedModel.save_pretrained` method, e.g., ``./my_model_directory/``. - A path or url to a saved configuration JSON `file`, e.g., ``./my_model_directory/configuration.json``. cache_dir (:obj:`str`, `optional`): diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 0509a0e1ad6..c4044aece51 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -43,6 +43,9 @@ class PretrainedConfig(object): recreate the correct object in :class:`~transformers.AutoConfig`. Args: + name_or_path (:obj:`str`, `optional`, defaults to :obj:`""`): + Store the string that was passed to :func:`~transformers.PreTrainedModel.from_pretrained` or :func:`~transformers.TFPreTrainedModel.from_pretrained` + as ``pretrained_model_name_or_path`` if the configuration was created with such a method. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not the model should return all hidden-states. 
output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): @@ -206,6 +209,9 @@ class PretrainedConfig(object): # TPU arguments self.xla_device = kwargs.pop("xla_device", None) + # Name or path to the pretrained checkpoint + self._name_or_path = str(kwargs.pop("name_or_path", "")) + # Additional attributes without default values for key, value in kwargs.items(): try: @@ -214,6 +220,14 @@ class PretrainedConfig(object): logger.error("Can't set {} with value {} for {}".format(key, value, self)) raise err + @property + def name_or_path(self) -> str: + return self._name_or_path + + @name_or_path.setter + def name_or_path(self, value): + self._name_or_path = str(value) # Make sure that name_or_path is a string (for JSON encoding) + @property def use_return_dict(self) -> bool: """ diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index e65c2f43996..59a46d5f4a9 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -20,13 +20,14 @@ from typing import Dict, List, Tuple -from sentencepiece import SentencePieceProcessor from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors from tokenizers.models import BPE, Unigram, WordPiece # from transformers.tokenization_openai import OpenAIGPTTokenizer from transformers.utils import sentencepiece_model_pb2 as model +from .file_utils import requires_sentencepiece + class SentencePieceExtractor: """ @@ -35,7 +36,9 @@ class SentencePieceExtractor: """ def __init__(self, model: str): - # Get SentencePiece + requires_sentencepiece(self) + from sentencepiece import SentencePieceProcessor + self.sp = SentencePieceProcessor() self.sp.Load(model) @@ -568,11 +571,10 @@ class T5Converter(SpmConverter): ) -CONVERTERS = { +SLOW_TO_FAST_CONVERTERS = { "AlbertTokenizer": AlbertConverter, - "BertTokenizer": BertConverter, - "BertGenerationTokenizer": BertGenerationConverter, "BartTokenizer": RobertaConverter, + "BertTokenizer": BertConverter, "CamembertTokenizer": CamembertConverter, "DistilBertTokenizer": BertConverter, "DPRReaderTokenizer": BertConverter, @@ -582,12 +584,17 @@ CONVERTERS = { "FunnelTokenizer": FunnelConverter, "GPT2Tokenizer": GPT2Converter, "HerbertTokenizer": HerbertConverter, + "LayoutLMTokenizer": BertConverter, + "LongformerTokenizer": RobertaConverter, "LxmertTokenizer": BertConverter, "MBartTokenizer": MBartConverter, + "MobileBertTokenizer": BertConverter, "OpenAIGPTTokenizer": OpenAIGPTConverter, "PegasusTokenizer": PegasusConverter, "ReformerTokenizer": ReformerConverter, + "RetriBertTokenizer": BertConverter, "RobertaTokenizer": RobertaConverter, + "SqueezeBertTokenizer": BertConverter, "T5Tokenizer": T5Converter, "XLMRobertaTokenizer": XLMRobertaConverter, "XLNetTokenizer": XLNetConverter, @@ -595,5 +602,26 @@ CONVERTERS = { def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer: - converter_class = CONVERTERS[transformer_tokenizer.__class__.__name__] + """Utilities to convert a slow tokenizer instance in a fast tokenizer instance. + + Args: + transformer_tokenizer (:class:`~transformers.tokenization_utils_base.PreTrainedTokenizer`): + Instance of a slow tokenizer to convert in the backend tokenizer for + :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerFast`. 
+ + Return: + A instance of :class:`~tokenizers.Tokenizer` to be used as the backend tokenizer of a + :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerFast` + """ + + tokenizer_class_name = transformer_tokenizer.__class__.__name__ + + if tokenizer_class_name not in SLOW_TO_FAST_CONVERTERS: + raise ValueError( + f"An instance of tokenizer class {tokenizer_class_name} cannot be converted in a Fast tokenizer instance. " + f"No converter was found. Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}" + ) + + converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name] + return converter_class(transformer_tokenizer).converted() diff --git a/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py new file mode 100755 index 00000000000..3245b840e85 --- /dev/null +++ b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Convert slow tokenizers checkpoints in fast (serialization format of the `tokenizers` library) """ + +import argparse +import os + +import transformers +from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS +from transformers.utils import logging + + +logging.set_verbosity_info() + +logger = logging.get_logger(__name__) + + +TOKENIZER_CLASSES = {name: getattr(transformers, name + "Fast") for name in SLOW_TO_FAST_CONVERTERS} + + +def convert_slow_checkpoint_to_fast(tokenizer_name, checkpoint_name, dump_path, force_download): + if tokenizer_name is not None and tokenizer_name not in TOKENIZER_CLASSES: + raise ValueError("Unrecognized tokenizer name, should be one of {}.".format(list(TOKENIZER_CLASSES.keys()))) + + if tokenizer_name is None: + tokenizer_names = TOKENIZER_CLASSES + else: + tokenizer_names = {tokenizer_name: getattr(transformers, tokenizer_name + "Fast")} + + logger.info(f"Loading tokenizer classes: {tokenizer_names}") + + for tokenizer_name in tokenizer_names: + tokenizer_class = TOKENIZER_CLASSES[tokenizer_name] + + add_prefix = True + if checkpoint_name is None: + checkpoint_names = list(tokenizer_class.max_model_input_sizes.keys()) + else: + checkpoint_names = [checkpoint_name] + + logger.info(f"For tokenizer {tokenizer_class.__class__.__name__} loading checkpoints: {checkpoint_names}") + + for checkpoint in checkpoint_names: + logger.info(f"Loading {tokenizer_class.__class__.__name__} {checkpoint}") + + # Load tokenizer + tokenizer = tokenizer_class.from_pretrained(checkpoint, force_download=force_download) + + # Save fast tokenizer + logger.info( + "Save fast tokenizer to {} with prefix {} add_prefix {}".format(dump_path, checkpoint, add_prefix) + ) + + # For organization names we create sub-directories + if "/" in checkpoint: + checkpoint_directory, checkpoint_prefix_name = checkpoint.split("/") + dump_path_full = os.path.join(dump_path, checkpoint_directory) + elif add_prefix: + 
checkpoint_prefix_name = checkpoint + dump_path_full = dump_path + else: + checkpoint_prefix_name = None + dump_path_full = dump_path + + logger.info( + "=> {} with prefix {}, add_prefix {}".format(dump_path_full, checkpoint_prefix_name, add_prefix) + ) + + file_path = list(tokenizer.pretrained_vocab_files_map.values())[0][checkpoint] + next_char = file_path.split(checkpoint)[-1][0] + if next_char == "/": + dump_path_full = os.path.join(dump_path_full, checkpoint_prefix_name) + checkpoint_prefix_name = None + + logger.info( + "=> {} with prefix {}, add_prefix {}".format(dump_path_full, checkpoint_prefix_name, add_prefix) + ) + + file_names = tokenizer.save_pretrained( + dump_path_full, legacy_format=False, filename_prefix=checkpoint_prefix_name + ) + logger.info("=> File names {}".format(file_names)) + + for file_name in file_names: + if not file_name.endswith("tokenizer.json"): + os.remove(file_name) + logger.info("=> removing {}".format(file_name)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--dump_path", default=None, type=str, required=True, help="Path to output generated fast tokenizer files." + ) + parser.add_argument( + "--tokenizer_name", + default=None, + type=str, + help="Optional tokenizer type selected in the list of {}. If not given, will download and convert all the checkpoints from AWS.".format( + list(TOKENIZER_CLASSES.keys()) + ), + ) + parser.add_argument( + "--checkpoint_name", + default=None, + type=str, + help="Optional checkpoint name. If not given, will download and convert the canonical checkpoints from AWS.", + ) + parser.add_argument( + "--force_download", + action="store_true", + help="Re-dowload checkpoints.", + ) + args = parser.parse_args() + + convert_slow_checkpoint_to_fast(args.tokenizer_name, args.checkpoint_name, args.dump_path, args.force_download) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index e46bba13e34..f2fec6bb2e9 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -4,9 +4,7 @@ from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union import torch from torch.nn.utils.rnn import pad_sequence -from ..tokenization_utils import PreTrainedTokenizer -from ..tokenization_utils_base import BatchEncoding, PaddingStrategy -from ..tokenization_utils_fast import PreTrainedTokenizerFast +from ..tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrainedTokenizerBase InputDataClass = NewType("InputDataClass", Any) @@ -94,7 +92,7 @@ class DataCollatorWithPadding: >= 7.5 (Volta). 
""" - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] + tokenizer: PreTrainedTokenizerBase padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None pad_to_multiple_of: Optional[int] = None @@ -124,7 +122,7 @@ class DataCollatorForLanguageModeling: - preprocesses batches for masked language modeling """ - tokenizer: PreTrainedTokenizer + tokenizer: PreTrainedTokenizerBase mlm: bool = True mlm_probability: float = 0.15 @@ -274,7 +272,7 @@ class DataCollatorForPermutationLanguageModeling: - preprocesses batches for permutation language modeling with procedures specific to XLNet """ - tokenizer: PreTrainedTokenizer + tokenizer: PreTrainedTokenizerBase plm_probability: float = 1 / 6 max_span_length: int = 5 # maximum length of a span of masked tokens @@ -406,7 +404,7 @@ class DataCollatorForNextSentencePrediction: - preprocesses batches for masked language modeling """ - tokenizer: PreTrainedTokenizer + tokenizer: PreTrainedTokenizerBase mlm: bool = True block_size: int = 512 short_seq_probability: float = 0.1 diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index 412cd47fcc2..9b1cb013de1 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -9,10 +9,7 @@ from torch.utils.data.dataset import Dataset from filelock import FileLock -from ...tokenization_bart import BartTokenizer, BartTokenizerFast -from ...tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast -from ...tokenization_utils import PreTrainedTokenizer -from ...tokenization_xlm_roberta import XLMRobertaTokenizer +from ...tokenization_utils_base import PreTrainedTokenizerBase from ...utils import logging from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors from ..processors.utils import InputFeatures @@ -69,7 +66,7 @@ class GlueDataset(Dataset): def __init__( self, args: GlueDataTrainingArguments, - tokenizer: PreTrainedTokenizer, + tokenizer: PreTrainedTokenizerBase, limit_length: Optional[int] = None, mode: Union[str, Split] = Split.train, cache_dir: Optional[str] = None, @@ -93,12 +90,12 @@ class GlueDataset(Dataset): ), ) label_list = self.processor.get_labels() - if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in ( - RobertaTokenizer, - RobertaTokenizerFast, - XLMRobertaTokenizer, - BartTokenizer, - BartTokenizerFast, + if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__.__name__ in ( + "RobertaTokenizer", + "RobertaTokenizerFast", + "XLMRobertaTokenizer", + "BartTokenizer", + "BartTokenizerFast", ): # HACK(label indices are swapped in RoBERTa pretrained model) label_list[1], label_list[2] = label_list[2], label_list[1] diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index ea319bcae9a..5832bb0993e 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -157,6 +157,24 @@ except (AttributeError, ImportError, KeyError): _in_notebook = False +try: + import sentencepiece # noqa: F401 + + _sentencepiece_available = True + +except ImportError: + _sentencepiece_available = False + + +try: + import tokenizers # noqa: F401 + + _tokenizers_available = True + +except ImportError: + _tokenizers_available = False + + default_cache_path = os.path.join(torch_cache_home, "transformers") @@ -170,6 +188,8 @@ TF_WEIGHTS_NAME = "model.ckpt" CONFIG_NAME = "config.json" MODEL_CARD_NAME = "modelcard.json" +SENTENCEPIECE_UNDERLINE = "▁" +SPIECE_UNDERLINE = SENTENCEPIECE_UNDERLINE # 
Kept for backward compatibility MULTIPLE_CHOICE_DUMMY_INPUTS = [ [[0, 1, 0, 1], [1, 0, 0, 1]] @@ -217,6 +237,18 @@ def is_faiss_available(): return _faiss_available +def is_sklearn_available(): + return _has_sklearn + + +def is_sentencepiece_available(): + return _sentencepiece_available + + +def is_tokenizers_available(): + return _tokenizers_available + + def is_in_notebook(): return _in_notebook @@ -234,10 +266,6 @@ def torch_only_method(fn): return wrapper -def is_sklearn_available(): - return _has_sklearn - - DATASETS_IMPORT_ERROR = """ {0} requires the 🀗 Datasets library but it was not found in your enviromnent. You can install it with: ``` @@ -255,6 +283,25 @@ that python file if that's the case. """ +TOKENIZERS_IMPORT_ERROR = """ +{0} requires the 🀗 Tokenizers library but it was not found in your enviromnent. You can install it with: +``` +pip install tokenizers +``` +In a notebook or a colab, you can install it by executing a cell with +``` +!pip install tokenizers +``` +""" + + +SENTENCEPIECE_IMPORT_ERROR = """ +{0} requires the SentencePiece library but it was not found in your enviromnent. Checkout the instructions on the +installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones +that match your enviromnent. +""" + + FAISS_IMPORT_ERROR = """ {0} requires the faiss library but it was not found in your enviromnent. Checkout the instructions on the installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones @@ -316,6 +363,18 @@ def requires_tf(obj): raise ImportError(TENSORFLOW_IMPORT_ERROR.format(name)) +def requires_tokenizers(obj): + name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ + if not is_tokenizers_available(): + raise ImportError(TOKENIZERS_IMPORT_ERROR.format(name)) + + +def requires_sentencepiece(obj): + name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ + if not is_sentencepiece_available(): + raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name)) + + def add_start_docstrings(*docstr): def docstring_decorator(fn): fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 1a2e83d028b..68140a46833 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -346,8 +346,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): self.__class__.__name__, self.__class__.__name__ ) ) - # Save config in model + # Save config and origin of the pretrained weights if given in model self.config = config + self.name_or_path = config.name_or_path def get_input_embeddings(self) -> tf.keras.layers.Layer: """ @@ -690,6 +691,8 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): else: resolved_archive_file = None + config.name_or_path = pretrained_model_name_or_path + # Instantiate model. 
model = cls(config, *model_args, **model_kwargs) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 4a24fedd3f7..3b6c4ce1236 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -432,8 +432,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): self.__class__.__name__, self.__class__.__name__ ) ) - # Save config in model + # Save config and origin of the pretrained weights if given in model self.config = config + self.name_or_path = config.name_or_path @property def base_model(self) -> nn.Module: @@ -933,6 +934,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin): else: resolved_archive_file = None + config.name_or_path = pretrained_model_name_or_path + # Instantiate model. model = cls(config, *model_args, **model_kwargs) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index afc70672ef7..96635ec25e3 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -10,7 +10,15 @@ from distutils.util import strtobool from io import StringIO from pathlib import Path -from .file_utils import _datasets_available, _faiss_available, _tf_available, _torch_available, _torch_tpu_available +from .file_utils import ( + _datasets_available, + _faiss_available, + _sentencepiece_available, + _tf_available, + _tokenizers_available, + _torch_available, + _torch_tpu_available, +) SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" @@ -107,6 +115,32 @@ def require_tf(test_case): return test_case +def require_sentencepiece(test_case): + """ + Decorator marking a test that requires SentencePiece. + + These tests are skipped when SentencePiece isn't installed. + + """ + if not _sentencepiece_available: + return unittest.skip("test requires SentencePiece")(test_case) + else: + return test_case + + +def require_tokenizers(test_case): + """ + Decorator marking a test that requires 🀗 Tokenizers. + + These tests are skipped when 🀗 Tokenizers isn't installed. + + """ + if not _tokenizers_available: + return unittest.skip("test requires tokenizers")(test_case) + else: + return test_case + + def require_multigpu(test_case): """ Decorator marking a test that requires a multi-GPU setup (in PyTorch). 
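The two decorators above mirror the existing `require_tf`-style helpers: they skip a test instead of erroring out when the optional backend is absent. A minimal usage sketch (the test class and method names here are purely illustrative, not taken from the PR's test suite):

# Hypothetical test file using the new decorators from transformers.testing_utils.
import unittest

from transformers.testing_utils import require_sentencepiece, require_tokenizers


@require_sentencepiece
@require_tokenizers
class AlbertTokenizationTest(unittest.TestCase):
    # Skipped with "test requires SentencePiece" / "test requires tokenizers"
    # when the corresponding library is not installed.
    def test_backends_importable(self):
        # Safe here because the decorators guarantee both backends are present.
        from transformers import AlbertTokenizer, AlbertTokenizerFast

        self.assertIsNotNone(AlbertTokenizer)
        self.assertIsNotNone(AlbertTokenizerFast)
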
diff --git a/src/transformers/tokenization_albert.py b/src/transformers/tokenization_albert.py index 424630b21fe..a0e00baf255 100644 --- a/src/transformers/tokenization_albert.py +++ b/src/transformers/tokenization_albert.py @@ -18,10 +18,11 @@ import os import unicodedata from shutil import copyfile -from typing import List, Optional +from typing import List, Optional, Tuple + +import sentencepiece as spm from .tokenization_utils import PreTrainedTokenizer -from .tokenization_utils_fast import PreTrainedTokenizerFast from .utils import logging @@ -138,15 +139,6 @@ class AlbertTokenizer(PreTrainedTokenizer): **kwargs, ) - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise - self.do_lower_case = do_lower_case self.remove_space = remove_space self.keep_accents = keep_accents @@ -171,14 +163,6 @@ class AlbertTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) @@ -321,225 +305,14 @@ class AlbertTokenizer(PreTrainedTokenizer): return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - -class AlbertTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on - `SentencePiece `__. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - `SentencePiece `__ file (generally has a `.spm` extension) that - contains the vocabulary necessary to instantiate a tokenizer. - do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to lowercase the input when tokenizing. - remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). - keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to keep accents when tokenizing. - bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - .. 
note:: - - When building a sequence using special tokens, this is not the token that is used for the beginning - of sequence. The token used is the :obj:`cls_token`. - eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): - The end of sequence token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the end - of sequence. The token used is the :obj:`sep_token`. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences - for sequence classification or for a text and a question for question answering. - It is also used as the last token of a sequence built with special tokens. - pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole - sequence instead of per-token classification). It is the first token of the sequence when built with - special tokens. - mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - - Attributes: - sp_model (:obj:`SentencePieceProcessor`): - The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - slow_tokenizer_class = AlbertTokenizer - - def __init__( - self, - vocab_file, - do_lower_case=True, - remove_space=True, - keep_accents=False, - bos_token="[CLS]", - eos_token="[SEP]", - unk_token="", - sep_token="[SEP]", - pad_token="", - cls_token="[CLS]", - mask_token="[MASK]", - **kwargs - ): - super().__init__( - vocab_file, - do_lower_case=do_lower_case, - remove_space=remove_space, - keep_accents=keep_accents, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - **kwargs, + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] ) - self.do_lower_case = do_lower_case - self.remove_space = remove_space - self.keep_accents = keep_accents - self.vocab_file = vocab_file - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks - by concatenating and adding special tokens. - An ALBERT sequence has the following format: - - - single sequence: ``[CLS] X [SEP]`` - - pair of sequences: ``[CLS] A [SEP] B [SEP]`` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. 
- - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return cls + token_ids_0 + sep - return cls + token_ids_0 + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. - An ALBERT sequence pair mask has the following format: - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given - sequence(s). - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/src/transformers/tokenization_albert_fast.py b/src/transformers/tokenization_albert_fast.py new file mode 100644 index 00000000000..0de765801e3 --- /dev/null +++ b/src/transformers/tokenization_albert_fast.py @@ -0,0 +1,260 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for ALBERT model.""" + + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from .file_utils import is_sentencepiece_available +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .utils import logging + + +if is_sentencepiece_available(): + from .tokenization_albert import AlbertTokenizer +else: + AlbertTokenizer = None + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-spiece.model", + "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-spiece.model", + "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-spiece.model", + "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-spiece.model", + "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model", + "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model", + "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model", + "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model", + }, + "tokenizer_file": { + "albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json", + "albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v1-tokenizer.json", + "albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v1-tokenizer.json", + "albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v1-tokenizer.json", + "albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-tokenizer.json", + "albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-tokenizer.json", + "albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-tokenizer.json", + "albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "albert-base-v1": 512, + "albert-large-v1": 512, + "albert-xlarge-v1": 512, + "albert-xxlarge-v1": 512, + "albert-base-v2": 512, + "albert-large-v2": 512, + "albert-xlarge-v2": 512, + "albert-xxlarge-v2": 512, +} + +SPIECE_UNDERLINE = "▁" + + +class AlbertTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on + `SentencePiece `__. + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. 
+ Args: + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a `.spm` extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to keep accents when tokenizing. + bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + .. note:: + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The end of sequence token. + .. note:: + When building a sequence using special tokens, this is not the token that is used for the end + of sequence. The token used is the :obj:`sep_token`. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = AlbertTokenizer + + def __init__( + self, + vocab_file, + tokenizer_file=None, + do_lower_case=True, + remove_space=True, + keep_accents=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="", + sep_token="[SEP]", + pad_token="", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + An ALBERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 
+ An ALBERT sequence pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/tokenization_auto.py b/src/transformers/tokenization_auto.py index f833791dcee..018deb90b9e 100644 --- a/src/transformers/tokenization_auto.py +++ b/src/transformers/tokenization_auto.py @@ -56,45 +56,108 @@ from .configuration_auto import ( replace_list_option_in_docstrings, ) from .configuration_utils import PretrainedConfig -from .tokenization_albert import AlbertTokenizer, AlbertTokenizerFast -from .tokenization_bart import BartTokenizer, BartTokenizerFast -from .tokenization_bert import BertTokenizer, BertTokenizerFast -from .tokenization_bert_generation import BertGenerationTokenizer +from .file_utils import is_sentencepiece_available, is_tokenizers_available +from .tokenization_bart import BartTokenizer +from .tokenization_bert import BertTokenizer from .tokenization_bert_japanese import BertJapaneseTokenizer from .tokenization_bertweet import BertweetTokenizer from .tokenization_blenderbot import BlenderbotSmallTokenizer -from .tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast from .tokenization_ctrl import CTRLTokenizer from .tokenization_deberta import DebertaTokenizer -from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast -from .tokenization_dpr import DPRQuestionEncoderTokenizer, DPRQuestionEncoderTokenizerFast -from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast +from .tokenization_distilbert import DistilBertTokenizer +from .tokenization_dpr import DPRQuestionEncoderTokenizer +from .tokenization_electra import ElectraTokenizer from .tokenization_flaubert import FlaubertTokenizer from .tokenization_fsmt import FSMTTokenizer -from .tokenization_funnel import FunnelTokenizer, FunnelTokenizerFast -from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast -from .tokenization_layoutlm import LayoutLMTokenizer, LayoutLMTokenizerFast -from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast -from .tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast -from .tokenization_marian import MarianTokenizer -from .tokenization_mbart import MBartTokenizer, MBartTokenizerFast -from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast -from .tokenization_openai import 
OpenAIGPTTokenizer, OpenAIGPTTokenizerFast -from .tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast +from .tokenization_funnel import FunnelTokenizer +from .tokenization_gpt2 import GPT2Tokenizer +from .tokenization_layoutlm import LayoutLMTokenizer +from .tokenization_longformer import LongformerTokenizer +from .tokenization_lxmert import LxmertTokenizer +from .tokenization_mobilebert import MobileBertTokenizer +from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_phobert import PhobertTokenizer from .tokenization_rag import RagTokenizer -from .tokenization_reformer import ReformerTokenizer, ReformerTokenizerFast -from .tokenization_retribert import RetriBertTokenizer, RetriBertTokenizerFast -from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast -from .tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast -from .tokenization_t5 import T5Tokenizer, T5TokenizerFast +from .tokenization_retribert import RetriBertTokenizer +from .tokenization_roberta import RobertaTokenizer +from .tokenization_squeezebert import SqueezeBertTokenizer from .tokenization_transfo_xl import TransfoXLTokenizer from .tokenization_xlm import XLMTokenizer -from .tokenization_xlm_roberta import XLMRobertaTokenizer, XLMRobertaTokenizerFast -from .tokenization_xlnet import XLNetTokenizer, XLNetTokenizerFast from .utils import logging +if is_sentencepiece_available(): + from .tokenization_albert import AlbertTokenizer + from .tokenization_bert_generation import BertGenerationTokenizer + from .tokenization_camembert import CamembertTokenizer + from .tokenization_marian import MarianTokenizer + from .tokenization_mbart import MBartTokenizer + from .tokenization_pegasus import PegasusTokenizer + from .tokenization_reformer import ReformerTokenizer + from .tokenization_t5 import T5Tokenizer + from .tokenization_xlm_roberta import XLMRobertaTokenizer + from .tokenization_xlnet import XLNetTokenizer +else: + AlbertTokenizer = None + BertGenerationTokenizer = None + CamembertTokenizer = None + MarianTokenizer = None + MBartTokenizer = None + PegasusTokenizer = None + ReformerTokenizer = None + T5Tokenizer = None + XLMRobertaTokenizer = None + XLNetTokenizer = None + +if is_tokenizers_available(): + from .tokenization_albert_fast import AlbertTokenizerFast + from .tokenization_bart_fast import BartTokenizerFast + from .tokenization_bert_fast import BertTokenizerFast + from .tokenization_camembert_fast import CamembertTokenizerFast + from .tokenization_distilbert_fast import DistilBertTokenizerFast + from .tokenization_dpr_fast import DPRQuestionEncoderTokenizerFast + from .tokenization_electra_fast import ElectraTokenizerFast + from .tokenization_funnel_fast import FunnelTokenizerFast + from .tokenization_gpt2_fast import GPT2TokenizerFast + from .tokenization_layoutlm_fast import LayoutLMTokenizerFast + from .tokenization_longformer_fast import LongformerTokenizerFast + from .tokenization_lxmert_fast import LxmertTokenizerFast + from .tokenization_mbart_fast import MBartTokenizerFast + from .tokenization_mobilebert_fast import MobileBertTokenizerFast + from .tokenization_openai_fast import OpenAIGPTTokenizerFast + from .tokenization_pegasus_fast import PegasusTokenizerFast + from .tokenization_reformer_fast import ReformerTokenizerFast + from .tokenization_retribert_fast import RetriBertTokenizerFast + from .tokenization_roberta_fast import RobertaTokenizerFast + from .tokenization_squeezebert_fast import SqueezeBertTokenizerFast + from 
.tokenization_t5_fast import T5TokenizerFast + from .tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast + from .tokenization_xlnet_fast import XLNetTokenizerFast +else: + AlbertTokenizerFast = None + BartTokenizerFast = None + BertTokenizerFast = None + CamembertTokenizerFast = None + DistilBertTokenizerFast = None + DPRQuestionEncoderTokenizerFast = None + ElectraTokenizerFast = None + FunnelTokenizerFast = None + GPT2TokenizerFast = None + LayoutLMTokenizerFast = None + LongformerTokenizerFast = None + LxmertTokenizerFast = None + MBartTokenizerFast = None + MobileBertTokenizerFast = None + OpenAIGPTTokenizerFast = None + PegasusTokenizerFast = None + ReformerTokenizerFast = None + RetriBertTokenizerFast = None + RobertaTokenizerFast = None + SqueezeBertTokenizerFast = None + T5TokenizerFast = None + XLMRobertaTokenizerFast = None + XLNetTokenizerFast = None + logger = logging.get_logger(__name__) @@ -111,7 +174,7 @@ TOKENIZER_MAPPING = OrderedDict( (XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)), (MarianConfig, (MarianTokenizer, None)), (BlenderbotConfig, (BlenderbotSmallTokenizer, None)), - (LongformerConfig, (LongformerTokenizer, None)), + (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)), (BartConfig, (BartTokenizer, BartTokenizerFast)), (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)), (RobertaConfig, (BertweetTokenizer, None)), @@ -139,7 +202,11 @@ TOKENIZER_MAPPING = OrderedDict( ] ) -SLOW_TOKENIZER_MAPPING = {k: v[0] for k, v in TOKENIZER_MAPPING.items()} +SLOW_TOKENIZER_MAPPING = { + k: (v[0] if v[0] is not None else v[1]) + for k, v in TOKENIZER_MAPPING.items() + if (v[0] is not None or v[1] is not None) +} class AutoTokenizer: @@ -254,7 +321,7 @@ class AutoTokenizer: if type(config) in TOKENIZER_MAPPING.keys(): tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)] - if tokenizer_class_fast and use_fast: + if tokenizer_class_fast and (use_fast or tokenizer_class_py is None): return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) else: return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index 40fe7c0e9e6..47fdb1218bf 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -15,7 +15,7 @@ from typing import List, Optional -from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast +from .tokenization_roberta import RobertaTokenizer from .tokenization_utils_base import BatchEncoding from .utils import logging @@ -154,114 +154,3 @@ class BartTokenizer(RobertaTokenizer): )["input_ids"] model_inputs["labels"] = labels return model_inputs - - -class BartTokenizerFast(RobertaTokenizerFast): - # merges and vocab same as Roberta - max_model_input_sizes = {m: 1024 for m in _all_bart_models} - pretrained_vocab_files_map = { - "vocab_file": {m: vocab_url for m in _all_bart_models}, - "merges_file": {m: merges_url for m in _all_bart_models}, - } - slow_tokenizer_class = BartTokenizer - - def prepare_seq2seq_batch( - self, - src_texts: List[str], - tgt_texts: Optional[List[str]] = None, - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - padding: str = "longest", - return_tensors: str = "None", - truncation=True, - **kwargs, - ) -> BatchEncoding: - r""" - - Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. 
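With the back-end imports above made optional, any tokenizer whose dependency is missing is replaced by ``None``, ``SLOW_TOKENIZER_MAPPING`` skips the empty entries, and ``AutoTokenizer.from_pretrained`` falls back to the fast class whenever the slow one is unavailable. A minimal sketch of the resulting behaviour (illustrative only, assuming a standard install):

    from transformers import AutoTokenizer
    from transformers.file_utils import is_sentencepiece_available, is_tokenizers_available

    # The fast tokenizer is returned when use_fast=True, or when the slow class
    # is None because sentencepiece is not installed in this environment.
    if is_tokenizers_available():
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
    else:
        # Only the pure-Python implementations remain importable.
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
    print(type(tokenizer).__name__)  # BertTokenizerFast or BertTokenizer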
- - Args: - src_texts: (:obj:`List[str]`): - List of documents to summarize or source language texts. - tgt_texts: (:obj:`List[str]`, `optional`): - List of summaries or target language texts. - max_length (:obj:`int`, `optional`): - Controls the maximum length for encoder inputs (documents to summarize or source language texts). - If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum - length is required by one of the truncation/padding parameters. If the model has no specific maximum - input length (like XLNet) truncation/padding to a maximum length will be deactivated. - max_target_length (:obj:`int`, `optional`): - Controls the maximum length of decoder inputs (target language texts or summaries). - If left unset or set to :obj:`None`, this will use the max_length value. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): - Activates and controls padding. Accepts the following values: - - * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a - single sequence if provided). - * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the - maximum acceptable input length for the model if that argument is not provided. - * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of - different lengths). - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"): - If set, will return tensors instead of list of python integers. Acceptable values are: - - * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. - * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. - * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. - truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`): - Activates and controls truncation. Accepts the following values: - - * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument - :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not - provided. This will truncate token by token, removing a token from the longest sequence in the pair - if a pair of sequences (or a batch of pairs) is provided. - * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to - the maximum acceptable input length for the model if that argument is not provided. This will only - truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or - to the maximum acceptable input length for the model if that argument is not provided. This will only - truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. - * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with - sequence lengths greater than the model maximum admissible input size). - **kwargs: - Additional keyword arguments passed along to :obj:`self.__call__`. - - Returns: - :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: - - - **input_ids** -- List of token ids to be fed to the encoder. 
- - **attention_mask** -- List of indices specifying which tokens should be attended to by the model. - - **decoder_input_ids** -- List of token ids to be fed to the decoder. - - **decoder_attention_mask** -- List of indices specifying which tokens should be attended to by the decoder. - This does not include causal mask, which is built by the model. - - The full set of keys ``[input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]``, - will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys. - """ - if max_length is None: - max_length = self.model_max_length - model_inputs: BatchEncoding = self( - src_texts, - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - padding=padding, - truncation=truncation, - **kwargs, - ) - if tgt_texts is None: - return model_inputs - # Process tgt_texts - if max_target_length is None: - max_target_length = max_length - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=truncation, - **kwargs, - )["input_ids"] - model_inputs["labels"] = labels - return model_inputs diff --git a/src/transformers/tokenization_bart_fast.py b/src/transformers/tokenization_bart_fast.py new file mode 100644 index 00000000000..d86028398b4 --- /dev/null +++ b/src/transformers/tokenization_bart_fast.py @@ -0,0 +1,151 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
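``BartTokenizerFast`` is not dropped but moved into the new ``tokenization_bart_fast.py`` module below; the top-level import path should be unchanged. A small sanity-check sketch (assuming the ``tokenizers`` back end is installed):

    # Both names should refer to the same class after the split; the second
    # import is the new module location added in this patch.
    from transformers import BartTokenizerFast
    from transformers.tokenization_bart_fast import BartTokenizerFast as _BartFast

    assert BartTokenizerFast is _BartFast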
+ +from typing import List, Optional + +from .tokenization_bart import BartTokenizer +from .tokenization_roberta_fast import RobertaTokenizerFast +from .tokenization_utils_base import BatchEncoding +from .utils import logging + + +logger = logging.get_logger(__name__) + + +# vocab and merges same as roberta +vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" +merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" +tokenizer_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tokenizer.json" +_all_bart_models = [ + "facebook/bart-base", + "facebook/bart-large", + "facebook/bart-large-mnli", + "facebook/bart-large-cnn", + "facebook/bart-large-xsum", + "yjernite/bart_eli5", + # This is not exhaustive: see https://huggingface.co/models?filter=bart +] + + +class BartTokenizerFast(RobertaTokenizerFast): + # merges and vocab same as Roberta + max_model_input_sizes = {m: 1024 for m in _all_bart_models} + pretrained_vocab_files_map = { + "vocab_file": {m: vocab_url for m in _all_bart_models}, + "merges_file": {m: merges_url for m in _all_bart_models}, + "tokenizer_file": {m: tokenizer_url for m in _all_bart_models}, + } + slow_tokenizer_class = BartTokenizer + + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + padding: str = "longest", + return_tensors: str = "None", + truncation=True, + **kwargs, + ) -> BatchEncoding: + r""" + + Prepare a batch that can be passed directly to an instance of :class:`~transformers.BartModel`. + + Args: + src_texts: (:obj:`List[str]`): + List of documents to summarize or source language texts. + tgt_texts: (:obj:`List[str]`, `optional`): + List of summaries or target language texts. + max_length (:obj:`int`, `optional`): + Controls the maximum length for encoder inputs (documents to summarize or source language texts). + If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum + length is required by one of the truncation/padding parameters. If the model has no specific maximum + input length (like XLNet) truncation/padding to a maximum length will be deactivated. + max_target_length (:obj:`int`, `optional`): + Controls the maximum length of decoder inputs (target language texts or summaries). + If left unset or set to :obj:`None`, this will use the max_length value. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): + Activates and controls padding. Accepts the following values: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"): + If set, will return tensors instead of list of python integers. Acceptable values are: + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. 
+ truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`): + Activates and controls truncation. Accepts the following values: + + * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument + :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not + provided. This will truncate token by token, removing a token from the longest sequence in the pair + if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with + sequence lengths greater than the model maximum admissible input size). + **kwargs: + Additional keyword arguments passed along to :obj:`self.__call__`. + + Returns: + :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: + + - **input_ids** -- List of token ids to be fed to the encoder. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model. + - **decoder_input_ids** -- List of token ids to be fed to the decoder. + - **decoder_attention_mask** -- List of indices specifying which tokens should be attended to by the decoder. + This does not include causal mask, which is built by the model. + + The full set of keys ``[input_ids, attention_mask, decoder_input_ids, decoder_attention_mask]``, + will only be returned if tgt_texts is passed. Otherwise, input_ids, attention_mask will be the only keys. + """ + if max_length is None: + max_length = self.model_max_length + model_inputs: BatchEncoding = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=truncation, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + labels = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=truncation, + **kwargs, + )["input_ids"] + model_inputs["labels"] = labels + return model_inputs diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py index 3e646f87747..dbcc117dd72 100644 --- a/src/transformers/tokenization_bert.py +++ b/src/transformers/tokenization_bert.py @@ -12,16 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
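A hypothetical usage sketch for ``prepare_seq2seq_batch`` as implemented above (assuming PyTorch is installed); note that when ``tgt_texts`` is given the encoded targets come back under the ``labels`` key:

    from transformers import BartTokenizerFast

    tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-large-cnn")
    batch = tokenizer.prepare_seq2seq_batch(
        src_texts=["A long news article to be summarized."],
        tgt_texts=["A short summary."],
        max_length=1024,
        max_target_length=64,
        return_tensors="pt",
    )
    # Keys produced by the implementation above: input_ids, attention_mask, labels.
    print(sorted(batch.keys()))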
-"""Tokenization classes.""" +"""Tokenization classes for Bert.""" import collections import os import unicodedata -from typing import List, Optional +from typing import List, Optional, Tuple from .tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace -from .tokenization_utils_fast import PreTrainedTokenizerFast from .utils import logging @@ -329,22 +328,14 @@ class BertTokenizer(PreTrainedTokenizer): return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - def save_vocabulary(self, vocab_path): - """ - Save the vocabulary and special tokens file to a directory. - - Args: - vocab_path (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: index = 0 - if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) else: - vocab_file = vocab_path + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: @@ -565,135 +556,3 @@ class WordpieceTokenizer(object): else: output_tokens.extend(sub_tokens) return output_tokens - - -class BertTokenizerFast(PreTrainedTokenizerFast): - r""" - Construct a "fast" BERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - File containing the vocabulary. - do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to lowercase the input when tokenizing. - unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences - for sequence classification or for a text and a question for question answering. - It is also used as the last token of a sequence built with special tokens. - pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole - sequence instead of per-token classification). It is the first token of the sequence when built with - special tokens. - mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. 
- clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to clean the text before tokenization by removing any control characters and - replacing all whitespaces by the classic one. - tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to tokenize Chinese characters. - This should likely be deactivated for Japanese (see `this issue - `__). - strip_accents: (:obj:`bool`, `optional`): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for :obj:`lowercase` (as in the original BERT). - wordpieces_prefix: (:obj:`str`, `optional`, defaults to :obj:`"##"`): - The prefix for subwords. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - slow_tokenizer_class = BertTokenizer - - def __init__( - self, - vocab_file, - do_lower_case=True, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - cls_token="[CLS]", - mask_token="[MASK]", - tokenize_chinese_chars=True, - strip_accents=None, - **kwargs - ): - super().__init__( - vocab_file, - do_lower_case=do_lower_case, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - - self.do_lower_case = do_lower_case - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks - by concatenating and adding special tokens. - A BERT sequence has the following format: - - - single sequence: ``[CLS] X [SEP]`` - - pair of sequences: ``[CLS] A [SEP] B [SEP]`` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - - if token_ids_1: - output += token_ids_1 + [self.sep_token_id] - - return output - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. - A BERT sequence pair mask has the following format: - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given - sequence(s). 
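As with BART, the fast BERT tokenizer is moved rather than removed: it reappears in the new ``tokenization_bert_fast.py`` module below, now with a ``tokenizer_file`` (serialized ``tokenizer.json``) entry per checkpoint. Loading is unchanged from the user's point of view (illustrative):

    from transformers import BertTokenizerFast

    # Fetches vocab.txt and, when published for the checkpoint, tokenizer.json.
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
    print(tokenizer.tokenize("Hello world"))  # ['Hello', 'world']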
- """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] diff --git a/src/transformers/tokenization_bert_fast.py b/src/transformers/tokenization_bert_fast.py new file mode 100644 index 00000000000..9a9769a15e7 --- /dev/null +++ b/src/transformers/tokenization_bert_fast.py @@ -0,0 +1,262 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization classes for Bert.""" + +import json +from typing import List, Optional, Tuple + +from tokenizers import normalizers + +from .tokenization_bert import BertTokenizer +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", + "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", + "TurkuNLP/bert-base-finnish-cased-v1": 
"https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", + "TurkuNLP/bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", + "wietsedv/bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt", + }, + "tokenizer_file": { + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tokenizer.json", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tokenizer.json", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tokenizer.json", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tokenizer.json", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tokenizer.json", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tokenizer.json", + "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-tokenizer.json", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tokenizer.json", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tokenizer.json", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tokenizer.json", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tokenizer.json", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tokenizer.json", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-tokenizer.json", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-tokenizer.json", + "TurkuNLP/bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tokenizer.json", + "TurkuNLP/bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tokenizer.json", + "wietsedv/bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + "bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-base-german-cased": 512, + "bert-large-uncased-whole-word-masking": 512, + "bert-large-cased-whole-word-masking": 512, + "bert-large-uncased-whole-word-masking-finetuned-squad": 512, + "bert-large-cased-whole-word-masking-finetuned-squad": 512, + "bert-base-cased-finetuned-mrpc": 512, + "bert-base-german-dbmdz-cased": 512, + "bert-base-german-dbmdz-uncased": 512, + "TurkuNLP/bert-base-finnish-cased-v1": 
512, + "TurkuNLP/bert-base-finnish-uncased-v1": 512, + "wietsedv/bert-base-dutch-cased": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "bert-base-uncased": {"do_lower_case": True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-base-german-cased": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, + "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, + "bert-base-german-dbmdz-cased": {"do_lower_case": False}, + "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, + "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False}, + "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True}, + "wietsedv/bert-base-dutch-cased": {"do_lower_case": False}, +} + + +class BertTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" BERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to clean the text before tokenization by removing any control characters and + replacing all whitespaces by the classic one. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. + This should likely be deactivated for Japanese (see `this issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. 
If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + wordpieces_prefix: (:obj:`str`, `optional`, defaults to :obj:`"##"`): + The prefix for subwords. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = BertTokenizer + + def __init__( + self, + vocab_file, + tokenizer_file=None, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__()) + if ( + pre_tok_state.get("do_lower_case", do_lower_case) != do_lower_case + or pre_tok_state.get("strip_accents", strip_accents) != strip_accents + ): + pre_tok_class = getattr(normalizers, pre_tok_state.pop("type")) + pre_tok_state["do_lower_case"] = do_lower_case + pre_tok_state["strip_accents"] = strip_accents + self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state) + + self.do_lower_case = do_lower_case + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A BERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1: + output += token_ids_1 + [self.sep_token_id] + + return output + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). 
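A quick illustrative check of the pair format and token-type mask documented above, using the fast class (assuming the ``tokenizers`` back end is installed):

    from transformers import BertTokenizerFast

    tok = BertTokenizerFast.from_pretrained("bert-base-uncased")
    a = tok.convert_tokens_to_ids(tok.tokenize("how are you"))
    b = tok.convert_tokens_to_ids(tok.tokenize("fine thanks"))
    ids = tok.build_inputs_with_special_tokens(a, b)       # [CLS] A [SEP] B [SEP]
    mask = tok.create_token_type_ids_from_sequences(a, b)  # 0s for A, 1s for B
    assert len(ids) == len(mask)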
+ """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) diff --git a/src/transformers/tokenization_bert_generation.py b/src/transformers/tokenization_bert_generation.py index fac4153fd8c..fe7b7496019 100644 --- a/src/transformers/tokenization_bert_generation.py +++ b/src/transformers/tokenization_bert_generation.py @@ -17,7 +17,9 @@ import os from shutil import copyfile -from typing import List +from typing import List, Optional, Tuple + +import sentencepiece as spm from .tokenization_utils import PreTrainedTokenizer from .utils import logging @@ -55,6 +57,8 @@ class BertGenerationTokenizer(PreTrainedTokenizer): """ vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = {"vocab_file": {"bert_for_seq_generation": tokenizer_url}} + max_model_input_sizes = {"bert_for_seq_generation": 512} prefix_tokens: List[int] = [] def __init__( @@ -77,16 +81,6 @@ class BertGenerationTokenizer(PreTrainedTokenizer): **kwargs, ) - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use T5Tokenizer:" - "https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise - self.vocab_file = vocab_file self.sp_model = spm.SentencePieceProcessor() @@ -108,14 +102,6 @@ class BertGenerationTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use BertGenerationTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) @@ -141,21 +127,13 @@ class BertGenerationTokenizer(PreTrainedTokenizer): out_string = self.sp_model.decode_pieces(tokens) return out_string - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. 
- """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/src/transformers/tokenization_bertweet.py b/src/transformers/tokenization_bertweet.py index b5cd4faaf2a..d846cb6c262 100644 --- a/src/transformers/tokenization_bertweet.py +++ b/src/transformers/tokenization_bertweet.py @@ -20,7 +20,7 @@ import html import os import re from shutil import copyfile -from typing import List, Optional +from typing import List, Optional, Tuple import regex @@ -383,22 +383,16 @@ class BertweetTokenizer(PreTrainedTokenizer): out_string = " ".join(tokens).replace("@@ ", "").strip() return out_string - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - out_merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + out_merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/src/transformers/tokenization_blenderbot.py b/src/transformers/tokenization_blenderbot.py index 2aee1564426..6d3dc356662 100644 --- a/src/transformers/tokenization_blenderbot.py +++ b/src/transformers/tokenization_blenderbot.py @@ -17,7 +17,7 @@ """"BlenderbotTokenizer and BlenderbotSmallTokenizer""" import json import os -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import regex as re @@ -235,22 +235,16 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer): out_string = " ".join(tokens).replace("@@ ", "").strip() return out_string - def save_vocabulary(self, save_directory: str) -> Tuple[str, str]: - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. 
- """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) diff --git a/src/transformers/tokenization_camembert.py b/src/transformers/tokenization_camembert.py index 2726ce1e160..908cdc32ad0 100644 --- a/src/transformers/tokenization_camembert.py +++ b/src/transformers/tokenization_camembert.py @@ -17,12 +17,11 @@ import os from shutil import copyfile -from typing import List, Optional +from typing import List, Optional, Tuple import sentencepiece as spm from .tokenization_utils import PreTrainedTokenizer -from .tokenization_utils_fast import PreTrainedTokenizerFast from .utils import logging @@ -253,14 +252,6 @@ class CamembertTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use CamembertTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) @@ -269,208 +260,14 @@ class CamembertTokenizer(PreTrainedTokenizer): out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - -class CamembertTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from - :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece - `__. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - vocab_file (:obj:`str`): - `SentencePiece `__ file (generally has a `.spm` extension) that - contains the vocabulary necessary to instantiate a tokenizer. - bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - .. 
note:: - - When building a sequence using special tokens, this is not the token that is used for the beginning - of sequence. The token used is the :obj:`cls_token`. - eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The end of sequence token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the end - of sequence. The token used is the :obj:`sep_token`. - sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences - for sequence classification or for a text and a question for question answering. - It is also used as the last token of a sequence built with special tokens. - cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The classifier token which is used when doing sequence classification (classification of the whole - sequence instead of per-token classification). It is the first token of the sequence when built with - special tokens. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. - - Attributes: - sp_model (:obj:`SentencePieceProcessor`): - The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["attention_mask"] - slow_tokenizer_class = CamembertTokenizer - - def __init__( - self, - vocab_file, - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - additional_special_tokens=["NOTUSED", "NOTUSED"], - **kwargs - ): - super().__init__( - vocab_file, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - cls_token=cls_token, - unk_token=unk_token, - pad_token=pad_token, - mask_token=mask_token, - additional_special_tokens=additional_special_tokens, - **kwargs, + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] ) - self.vocab_file = vocab_file - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks - by concatenating and adding special tokens. - An CamemBERT sequence has the following format: - - - single sequence: `` X `` - - pair of sequences: `` A B `` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
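The removed CamemBERT methods above (and their replacements in the new ``tokenization_camembert_fast.py`` module further down) build sequence pairs with a doubled separator and return all-zero token type ids. A quick illustrative check with the slow tokenizer (requires sentencepiece):

    from transformers import CamembertTokenizer

    tok = CamembertTokenizer.from_pretrained("camembert-base")
    a = tok.encode("Bonjour", add_special_tokens=False)
    b = tok.encode("le monde", add_special_tokens=False)
    ids = tok.build_inputs_with_special_tokens(a, b)         # <s> A </s></s> B </s>
    types = tok.create_token_type_ids_from_sequences(a, b)   # CamemBERT: all zeros
    assert set(types) == {0}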
- """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. - CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/src/transformers/tokenization_camembert_fast.py b/src/transformers/tokenization_camembert_fast.py new file mode 100644 index 00000000000..179695aac7a --- /dev/null +++ b/src/transformers/tokenization_camembert_fast.py @@ -0,0 +1,237 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +""" Fast tokenization classes for Camembert model.""" + + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from .file_utils import is_sentencepiece_available +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .utils import logging + + +if is_sentencepiece_available(): + from .tokenization_camembert import CamembertTokenizer +else: + CamembertTokenizer = None + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model", + }, + "tokenizer_file": { + "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "camembert-base": 512, +} + +SHARED_MODEL_IDENTIFIERS = [ + # Load with + # `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")` + "Musixmatch/umberto-commoncrawl-cased-v1", + "Musixmatch/umberto-wikipedia-uncased-v1", +] + +SPIECE_UNDERLINE = "▁" + + +class CamembertTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from + :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece + `__. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a `.spm` extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end + of sequence. The token used is the :obj:`sep_token`. + sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. 
A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] + slow_tokenizer_class = CamembertTokenizer + + def __init__( + self, + vocab_file, + tokenizer_file=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + additional_special_tokens=["NOTUSED", "NOTUSED"], + **kwargs + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + self.vocab_file = vocab_file + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + An CamemBERT sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
+ """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/tokenization_ctrl.py b/src/transformers/tokenization_ctrl.py index f27cc57d8b7..7c07825fa46 100644 --- a/src/transformers/tokenization_ctrl.py +++ b/src/transformers/tokenization_ctrl.py @@ -17,6 +17,7 @@ import json import os +from typing import Optional, Tuple import regex as re @@ -222,22 +223,16 @@ class CTRLTokenizer(PreTrainedTokenizer): out_string = " ".join(tokens).replace("@@ ", "").strip() return out_string - def save_vocabulary(self, save_directory): - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. 
- """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) diff --git a/src/transformers/tokenization_deberta.py b/src/transformers/tokenization_deberta.py index 086b5d506e2..a3a9edd1689 100644 --- a/src/transformers/tokenization_deberta.py +++ b/src/transformers/tokenization_deberta.py @@ -20,6 +20,7 @@ import pathlib import random import unicodedata from functools import lru_cache +from typing import Optional, Tuple from zipfile import ZipFile import tqdm @@ -466,10 +467,15 @@ class GPT2Tokenizer(object): self.count.append(n) return idx - def save_pretrained(self, path: str): + def save_pretrained(self, path: str, filename_prefix: str = None): import torch - torch.save(self.gpt2_encoder, path) + filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]] + if filename_prefix is not None: + filename = filename_prefix + "-" + filename + full_path = os.path.join(path, filename) + torch.save(self.gpt2_encoder, full_path) + return (full_path,) class DebertaTokenizer(PreTrainedTokenizer): @@ -653,11 +659,5 @@ class DebertaTokenizer(PreTrainedTokenizer): text = " " + text return (text, kwargs) - def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a directory or file.""" - if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) - else: - vocab_file = vocab_path - self.gpt2_tokenizer.save_pretrained(vocab_file) - return (vocab_file,) + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + return self.gpt2_tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix) diff --git a/src/transformers/tokenization_distilbert.py b/src/transformers/tokenization_distilbert.py index 1ab8cf30093..f7136b3e36d 100644 --- a/src/transformers/tokenization_distilbert.py +++ b/src/transformers/tokenization_distilbert.py @@ -14,7 +14,7 @@ # limitations under the License. """Tokenization classes for DistilBERT.""" -from .tokenization_bert import BertTokenizer, BertTokenizerFast +from .tokenization_bert import BertTokenizer from .utils import logging @@ -69,22 +69,3 @@ class DistilBertTokenizer(BertTokenizer): max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION model_input_names = ["attention_mask"] - - -class DistilBertTokenizerFast(BertTokenizerFast): - r""" - Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.DistilBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs - end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - model_input_names = ["attention_mask"] - slow_tokenizer_class = DistilBertTokenizer diff --git a/src/transformers/tokenization_distilbert_fast.py b/src/transformers/tokenization_distilbert_fast.py new file mode 100644 index 00000000000..5ba84cd10a8 --- /dev/null +++ b/src/transformers/tokenization_distilbert_fast.py @@ -0,0 +1,81 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for DistilBERT.""" + +from .tokenization_bert_fast import BertTokenizerFast +from .tokenization_distilbert import DistilBertTokenizer +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt", + "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + }, + "tokenizer_file": { + "distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json", + "distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tokenizer.json", + "distilbert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tokenizer.json", + "distilbert-base-cased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tokenizer.json", + "distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-tokenizer.json", + "distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "distilbert-base-uncased": 512, + "distilbert-base-uncased-distilled-squad": 512, + "distilbert-base-cased": 512, + "distilbert-base-cased-distilled-squad": 512, + "distilbert-base-german-cased": 512, + "distilbert-base-multilingual-cased": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "distilbert-base-uncased": {"do_lower_case": True}, + "distilbert-base-uncased-distilled-squad": {"do_lower_case": True}, + 
"distilbert-base-cased": {"do_lower_case": False}, + "distilbert-base-cased-distilled-squad": {"do_lower_case": False}, + "distilbert-base-german-cased": {"do_lower_case": False}, + "distilbert-base-multilingual-cased": {"do_lower_case": False}, +} + + +class DistilBertTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.DistilBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + model_input_names = ["attention_mask"] + slow_tokenizer_class = DistilBertTokenizer diff --git a/src/transformers/tokenization_dpr.py b/src/transformers/tokenization_dpr.py index bf40bc53c8d..57d6b7305e9 100644 --- a/src/transformers/tokenization_dpr.py +++ b/src/transformers/tokenization_dpr.py @@ -19,7 +19,7 @@ import collections from typing import List, Optional, Union from .file_utils import add_end_docstrings, add_start_docstrings -from .tokenization_bert import BertTokenizer, BertTokenizerFast +from .tokenization_bert import BertTokenizer from .tokenization_utils_base import BatchEncoding, TensorType from .utils import logging @@ -83,24 +83,6 @@ class DPRContextEncoderTokenizer(BertTokenizer): pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION -class DPRContextEncoderTokenizerFast(BertTokenizerFast): - r""" - Construct a "fast" DPRContextEncoder tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.DPRContextEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and - runs end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION - slow_tokenizer_class = DPRContextEncoderTokenizer - - class DPRQuestionEncoderTokenizer(BertTokenizer): r""" Constructs a DPRQuestionEncoder tokenizer. @@ -118,24 +100,6 @@ class DPRQuestionEncoderTokenizer(BertTokenizer): pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION -class DPRQuestionEncoderTokenizerFast(BertTokenizerFast): - r""" - Constructs a "fast" DPRQuestionEncoder tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.DPRQuestionEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and - runs end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION - slow_tokenizer_class = DPRQuestionEncoderTokenizer - - DPRSpanPrediction = collections.namedtuple( "DPRSpanPrediction", ["span_score", "relevance_score", "doc_id", "start_index", "end_index", "text"] ) @@ -398,25 +362,3 @@ class DPRReaderTokenizer(CustomDPRReaderTokenizerMixin, BertTokenizer): max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION model_input_names = ["attention_mask"] - - -@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING) -class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast): - r""" - Constructs a "fast" DPRReader tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.DPRReaderTokenizerFast` is almost identical to :class:`~transformers.BertTokenizerFast` and - runs end-to-end tokenization: punctuation splitting and wordpiece. The difference is that is has three inputs - strings: question, titles and texts that are combined to be fed to the :class:`~transformers.DPRReader` model. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. - - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = READER_PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION - model_input_names = ["attention_mask"] - slow_tokenizer_class = DPRReaderTokenizer diff --git a/src/transformers/tokenization_dpr_fast.py b/src/transformers/tokenization_dpr_fast.py new file mode 100644 index 00000000000..5607c1adbdf --- /dev/null +++ b/src/transformers/tokenization_dpr_fast.py @@ -0,0 +1,378 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
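For orientation, the fast DPR tokenizers removed above reappear unchanged in the new module that follows. The sketch below mirrors the ``Examples::`` block this module keeps for ``decode_best_spans``, with the fast reader tokenizer swapped in; it is an illustration rather than part of the patch, and it assumes the relocated classes stay re-exported from the top-level ``transformers`` package and that the ``facebook/dpr-reader-single-nq-base`` checkpoint referenced in the maps below is available::

    from transformers import DPRReader, DPRReaderTokenizerFast

    tokenizer = DPRReaderTokenizerFast.from_pretrained("facebook/dpr-reader-single-nq-base")
    model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")

    # The reader tokenizer takes three inputs (questions, titles, texts) and returns an
    # input_ids matrix of shape (n_passages, sequence_length).
    encoded_inputs = tokenizer(
        questions=["What is love ?"],
        titles=["Haddaway"],
        texts=["'What Is Love' is a song recorded by the artist Haddaway"],
        return_tensors="pt",
    )
    outputs = model(**encoded_inputs)
    predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
    print(predicted_spans[0].text)  # best span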
+"""Tokenization classes for DPR.""" + + +import collections +from typing import List, Optional, Union + +from .file_utils import add_end_docstrings, add_start_docstrings +from .tokenization_bert_fast import BertTokenizerFast +from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer +from .tokenization_utils_base import BatchEncoding, TensorType +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + +CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/dpr-ctx_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + }, + "tokenizer_file": { + "facebook/dpr-ctx_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json", + }, +} +QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/dpr-question_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + }, + "tokenizer_file": { + "facebook/dpr-question_encoder-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json", + }, +} +READER_PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/dpr-reader-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + }, + "tokenizer_file": { + "facebook/dpr-reader-single-nq-base": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json", + }, +} + +CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/dpr-ctx_encoder-single-nq-base": 512, +} +QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/dpr-question_encoder-single-nq-base": 512, +} +READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/dpr-reader-single-nq-base": 512, +} + + +CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION = { + "facebook/dpr-ctx_encoder-single-nq-base": {"do_lower_case": True}, +} +QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION = { + "facebook/dpr-question_encoder-single-nq-base": {"do_lower_case": True}, +} +READER_PRETRAINED_INIT_CONFIGURATION = { + "facebook/dpr-reader-single-nq-base": {"do_lower_case": True}, +} + + +class DPRContextEncoderTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" DPRContextEncoder tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.DPRContextEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and + runs end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = CONTEXT_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = CONTEXT_ENCODER_PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = DPRContextEncoderTokenizer + + +class DPRQuestionEncoderTokenizerFast(BertTokenizerFast): + r""" + Constructs a "fast" DPRQuestionEncoder tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.DPRQuestionEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and + runs end-to-end tokenization: punctuation splitting and wordpiece. 
+ + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = QUESTION_ENCODER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = QUESTION_ENCODER_PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = DPRQuestionEncoderTokenizer + + +DPRSpanPrediction = collections.namedtuple( + "DPRSpanPrediction", ["span_score", "relevance_score", "doc_id", "start_index", "end_index", "text"] +) + +DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "end_logits", "relevance_logits"]) + + +CUSTOM_DPR_READER_DOCSTRING = r""" + Return a dictionary with the token ids of the input strings and other information to give to + :obj:`.decode_best_spans`. + It converts the strings of a question and different passages (title and text) in a sequence of IDs (integers), + using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of size + :obj:`(n_passages, sequence_length)` with the format: + + [CLS] [SEP] [SEP] + + Args: + questions (:obj:`str` or :obj:`List[str]`): + The questions to be encoded. + You can specify one question for many passages. In this case, the question will be duplicated like + :obj:`[questions] * n_passages`. + Otherwise you have to specify as many questions as in :obj:`titles` or :obj:`texts`. + titles (:obj:`str` or :obj:`List[str]`): + The passages titles to be encoded. This can be a string or a list of strings if there are several passages. + texts (:obj:`str` or :obj:`List[str]`): + The passages texts to be encoded. This can be a string or a list of strings if there are several passages. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): + Activates and controls padding. Accepts the following values: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`): + Activates and controls truncation. Accepts the following values: + + * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument + :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not + provided. This will truncate token by token, removing a token from the longest sequence in the pair + if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. 
This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with + sequence lengths greater than the model maximum admissible input size). + max_length (:obj:`int`, `optional`): + Controls the maximum length to use by one of the truncation/padding parameters. + + If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum + length is required by one of the truncation/padding parameters. If the model has no specific maximum + input length (like XLNet) truncation/padding to a maximum length will be deactivated. + return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. + return_attention_mask (:obj:`bool`, `optional`): + Whether or not to return the attention mask. If not set, will return the attention mask according to the + specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + + Return: + :obj:`Dict[str, List[List[int]]]`: A dictionary with the following keys: + + - ``input_ids``: List of token ids to be fed to a model. + - ``attention_mask``: List of indices specifying which tokens should be attended to by the model. + """ + + +@add_start_docstrings(CUSTOM_DPR_READER_DOCSTRING) +class CustomDPRReaderTokenizerMixin: + def __call__( + self, + questions, + titles: Optional[str] = None, + texts: Optional[str] = None, + padding: Union[bool, str] = False, + truncation: Union[bool, str] = False, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_attention_mask: Optional[bool] = None, + **kwargs + ) -> BatchEncoding: + if titles is None and texts is None: + return super().__call__( + questions, + padding=padding, + truncation=truncation, + max_length=max_length, + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + **kwargs, + ) + elif titles is None or texts is None: + text_pair = titles if texts is None else texts + return super().__call__( + questions, + text_pair, + padding=padding, + truncation=truncation, + max_length=max_length, + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + **kwargs, + ) + titles = titles if not isinstance(titles, str) else [titles] + texts = texts if not isinstance(texts, str) else [texts] + n_passages = len(titles) + questions = questions if not isinstance(questions, str) else [questions] * n_passages + assert len(titles) == len( + texts + ), "There should be as many titles than texts but got {} titles and {} texts.".format(len(titles), len(texts)) + encoded_question_and_titles = super().__call__(questions, titles, padding=False, truncation=False)["input_ids"] + encoded_texts = super().__call__(texts, add_special_tokens=False, padding=False, truncation=False)["input_ids"] + encoded_inputs = { + "input_ids": [ + (encoded_question_and_title + encoded_text)[:max_length] + if max_length is not None and truncation + else encoded_question_and_title + encoded_text + for encoded_question_and_title, encoded_text in zip(encoded_question_and_titles, encoded_texts) + ] + } + if 
return_attention_mask is not False: + attention_mask = [input_ids != self.pad_token_id for input_ids in encoded_inputs["input_ids"]] + encoded_inputs["attention_mask"] = attention_mask + return self.pad(encoded_inputs, padding=padding, max_length=max_length, return_tensors=return_tensors) + + def decode_best_spans( + self, + reader_input: BatchEncoding, + reader_output: DPRReaderOutput, + num_spans: int = 16, + max_answer_length: int = 64, + num_spans_per_passage: int = 4, + ) -> List[DPRSpanPrediction]: + """ + Get the span predictions for the extractive Q&A model. + Outputs: `List` of `DPRReaderOutput` sorted by descending `(relevance_score, span_score)`. + Each `DPRReaderOutput` is a `Tuple` with: + **span_score**: ``float`` that corresponds to the score given by the reader for this span compared to other spans + in the same passage. It corresponds to the sum of the start and end logits of the span. + **relevance_score**: ``float`` that corresponds to the score of the each passage to answer the question, + compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader. + **doc_id**: ``int``` the id of the passage. + **start_index**: ``int`` the start index of the span (inclusive). + **end_index**: ``int`` the end index of the span (inclusive). + + Examples:: + + >>> from transformers import DPRReader, DPRReaderTokenizer + >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base') + >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base') + >>> encoded_inputs = tokenizer( + ... questions=["What is love ?"], + ... titles=["Haddaway"], + ... texts=["'What Is Love' is a song recorded by the artist Haddaway"], + ... return_tensors='pt' + ... ) + >>> outputs = model(**encoded_inputs) + >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs) + >>> print(predicted_spans[0].text) # best span + + """ + input_ids = reader_input["input_ids"] + start_logits, end_logits, relevance_logits = reader_output[:3] + n_passages = len(relevance_logits) + sorted_docs = sorted(range(n_passages), reverse=True, key=relevance_logits.__getitem__) + nbest_spans_predictions: List[DPRReaderOutput] = [] + for doc_id in sorted_docs: + sequence_ids = list(input_ids[doc_id]) + # assuming question & title information is at the beginning of the sequence + passage_offset = sequence_ids.index(self.sep_token_id, 2) + 1 # second sep id + if sequence_ids[-1] == self.pad_token_id: + sequence_len = sequence_ids.index(self.pad_token_id) + else: + sequence_len = len(sequence_ids) + + best_spans = self._get_best_spans( + start_logits=start_logits[doc_id][passage_offset:sequence_len], + end_logits=end_logits[doc_id][passage_offset:sequence_len], + max_answer_length=max_answer_length, + top_spans=num_spans_per_passage, + ) + for start_index, end_index in best_spans: + start_index += passage_offset + end_index += passage_offset + nbest_spans_predictions.append( + DPRSpanPrediction( + span_score=start_logits[doc_id][start_index] + end_logits[doc_id][end_index], + relevance_score=relevance_logits[doc_id], + doc_id=doc_id, + start_index=start_index, + end_index=end_index, + text=self.decode(sequence_ids[start_index : end_index + 1]), + ) + ) + if len(nbest_spans_predictions) >= num_spans: + break + return nbest_spans_predictions[:num_spans] + + def _get_best_spans( + self, + start_logits: List[int], + end_logits: List[int], + max_answer_length: int, + top_spans: int, + ) -> List[DPRSpanPrediction]: + """ + Finds the best answer span 
for the extractive Q&A model for one passage. + It returns the best span by descending `span_score` order and keeping max `top_spans` spans. + Spans longer that `max_answer_length` are ignored. + """ + scores = [] + for (start_index, start_score) in enumerate(start_logits): + for (answer_length, end_score) in enumerate(end_logits[start_index : start_index + max_answer_length]): + scores.append(((start_index, start_index + answer_length), start_score + end_score)) + scores = sorted(scores, key=lambda x: x[1], reverse=True) + chosen_span_intervals = [] + for (start_index, end_index), score in scores: + assert start_index <= end_index, "Wrong span indices: [{}:{}]".format(start_index, end_index) + length = end_index - start_index + 1 + assert length <= max_answer_length, "Span is too long: {} > {}".format(length, max_answer_length) + if any( + [ + start_index <= prev_start_index <= prev_end_index <= end_index + or prev_start_index <= start_index <= end_index <= prev_end_index + for (prev_start_index, prev_end_index) in chosen_span_intervals + ] + ): + continue + chosen_span_intervals.append((start_index, end_index)) + + if len(chosen_span_intervals) == top_spans: + break + return chosen_span_intervals + + +@add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING) +class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast): + r""" + Constructs a "fast" DPRReader tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.DPRReaderTokenizerFast` is almost identical to :class:`~transformers.BertTokenizerFast` and + runs end-to-end tokenization: punctuation splitting and wordpiece. The difference is that is has three inputs + strings: question, titles and texts that are combined to be fed to the :class:`~transformers.DPRReader` model. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = READER_PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = READER_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = READER_PRETRAINED_INIT_CONFIGURATION + model_input_names = ["attention_mask"] + slow_tokenizer_class = DPRReaderTokenizer diff --git a/src/transformers/tokenization_electra.py b/src/transformers/tokenization_electra.py index 30608ae04c3..50a3c1959c1 100644 --- a/src/transformers/tokenization_electra.py +++ b/src/transformers/tokenization_electra.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .tokenization_bert import BertTokenizer, BertTokenizerFast +from .tokenization_bert import BertTokenizer VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} @@ -64,20 +64,3 @@ class ElectraTokenizer(BertTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - - -class ElectraTokenizerFast(BertTokenizerFast): - r""" - Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.ElectraTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs - end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. 
- """ - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - slow_tokenizer_class = ElectraTokenizer diff --git a/src/transformers/tokenization_electra_fast.py b/src/transformers/tokenization_electra_fast.py new file mode 100644 index 00000000000..3ada305766a --- /dev/null +++ b/src/transformers/tokenization_electra_fast.py @@ -0,0 +1,75 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .tokenization_bert_fast import BertTokenizerFast +from .tokenization_electra import ElectraTokenizer + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/vocab.txt", + "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/vocab.txt", + "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/vocab.txt", + "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/vocab.txt", + "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt", + "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/vocab.txt", + }, + "tokenizer_file": { + "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/tokenizer.json", + "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/tokenizer.json", + "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/tokenizer.json", + "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/tokenizer.json", + "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/tokenizer.json", + "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "google/electra-small-generator": 512, + "google/electra-base-generator": 512, + "google/electra-large-generator": 512, + "google/electra-small-discriminator": 512, + "google/electra-base-discriminator": 512, + "google/electra-large-discriminator": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "google/electra-small-generator": {"do_lower_case": True}, + "google/electra-base-generator": 
{"do_lower_case": True}, + "google/electra-large-generator": {"do_lower_case": True}, + "google/electra-small-discriminator": {"do_lower_case": True}, + "google/electra-base-discriminator": {"do_lower_case": True}, + "google/electra-large-discriminator": {"do_lower_case": True}, +} + + +class ElectraTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.ElectraTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = ElectraTokenizer diff --git a/src/transformers/tokenization_fsmt.py b/src/transformers/tokenization_fsmt.py index 05ce582dffa..9d94fd2ff00 100644 --- a/src/transformers/tokenization_fsmt.py +++ b/src/transformers/tokenization_fsmt.py @@ -20,7 +20,7 @@ import logging import os import re import unicodedata -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple import sacremoses as sm @@ -37,9 +37,21 @@ VOCAB_FILES_NAMES = { "merges_file": "merges.txt", } -PRETRAINED_VOCAB_FILES_MAP = {} -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} -PRETRAINED_INIT_CONFIGURATION = {} +PRETRAINED_VOCAB_FILES_MAP = { + "src_vocab_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/vocab-src.json"}, + "tgt_vocab_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/vocab-tgt.json"}, + "merges_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/merges.txt"}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"stas/tiny-wmt19-en-de": 1024} +PRETRAINED_INIT_CONFIGURATION = { + "stas/tiny-wmt19-en-de": { + "langs": ["en", "de"], + "model_max_length": 1024, + "special_tokens_map_file": None, + "full_tokenizer_file": None, + } +} def get_pairs(word): @@ -494,24 +506,20 @@ class FSMTTokenizer(PreTrainedTokenizer): model_inputs["labels"] = self(tgt_texts, **tokenizer_kwargs)["input_ids"] return model_inputs - def save_vocabulary(self, save_directory): - """ - Save the vocabulary and special tokens file to a directory. - - Args: - vocab_path (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. 
- """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - src_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["src_vocab_file"]) - tgt_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["tgt_vocab_file"]) - merges_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) + src_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["src_vocab_file"] + ) + tgt_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tgt_vocab_file"] + ) + merges_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) with open(src_vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) diff --git a/src/transformers/tokenization_funnel.py b/src/transformers/tokenization_funnel.py index 48c768f59b9..b9df503384b 100644 --- a/src/transformers/tokenization_funnel.py +++ b/src/transformers/tokenization_funnel.py @@ -16,7 +16,7 @@ from typing import List, Optional -from .tokenization_bert import BertTokenizer, BertTokenizerFast +from .tokenization_bert import BertTokenizer from .utils import logging @@ -135,86 +135,3 @@ class FunnelTokenizer(BertTokenizer): if token_ids_1 is None: return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - - -class FunnelTokenizerFast(BertTokenizerFast): - r""" - Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.FunnelTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs - end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - slow_tokenizer_class = FunnelTokenizer - cls_token_type_id: int = 2 - - def __init__( - self, - vocab_file, - do_lower_case=True, - unk_token="", - sep_token="", - pad_token="", - cls_token="", - mask_token="", - bos_token="", - eos_token="", - clean_text=True, - tokenize_chinese_chars=True, - strip_accents=None, - wordpieces_prefix="##", - **kwargs - ): - super().__init__( - vocab_file, - do_lower_case=do_lower_case, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - bos_token=bos_token, - eos_token=eos_token, - clean_text=clean_text, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - wordpieces_prefix=wordpieces_prefix, - **kwargs, - ) - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
- A Funnel Transformer sequence pair mask has the following format: - - :: - - 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given - sequence(s). - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] - return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] diff --git a/src/transformers/tokenization_funnel_fast.py b/src/transformers/tokenization_funnel_fast.py new file mode 100644 index 00000000000..29a42459e58 --- /dev/null +++ b/src/transformers/tokenization_funnel_fast.py @@ -0,0 +1,153 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization class for Funnel Transformer.""" + +from typing import List, Optional + +from .tokenization_bert_fast import BertTokenizerFast +from .tokenization_funnel import FunnelTokenizer +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + +_model_names = [ + "small", + "small-base", + "medium", + "medium-base", + "intermediate", + "intermediate-base", + "large", + "large-base", + "xlarge", + "xlarge-base", +] + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "funnel-transformer/small": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small/vocab.txt", + "funnel-transformer/small-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small-base/vocab.txt", + "funnel-transformer/medium": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium/vocab.txt", + "funnel-transformer/medium-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium-base/vocab.txt", + "funnel-transformer/intermediate": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate/vocab.txt", + "funnel-transformer/intermediate-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate-base/vocab.txt", + "funnel-transformer/large": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large/vocab.txt", + "funnel-transformer/large-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large-base/vocab.txt", + "funnel-transformer/xlarge": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge/vocab.txt", + "funnel-transformer/xlarge-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge-base/vocab.txt", + }, + 
"tokenizer_file": { + "funnel-transformer/small": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small/tokenizer.json", + "funnel-transformer/small-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/small-base/tokenizer.json", + "funnel-transformer/medium": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium/tokenizer.json", + "funnel-transformer/medium-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/medium-base/tokenizer.json", + "funnel-transformer/intermediate": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate/tokenizer.json", + "funnel-transformer/intermediate-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/intermediate-base/tokenizer.json", + "funnel-transformer/large": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large/tokenizer.json", + "funnel-transformer/large-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/large-base/tokenizer.json", + "funnel-transformer/xlarge": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge/tokenizer.json", + "funnel-transformer/xlarge-base": "https://s3.amazonaws.com/models.huggingface.co/bert/funnel-transformer/xlarge-base/tokenizer.json", + }, +} +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {f"funnel-transformer/{name}": 512 for name in _model_names} +PRETRAINED_INIT_CONFIGURATION = {f"funnel-transformer/{name}": {"do_lower_case": True} for name in _model_names} + + +class FunnelTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.FunnelTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = FunnelTokenizer + cls_token_type_id: int = 2 + + def __init__( + self, + vocab_file, + tokenizer_file=None, + do_lower_case=True, + unk_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + bos_token="", + eos_token="", + clean_text=True, + tokenize_chinese_chars=True, + strip_accents=None, + wordpieces_prefix="##", + **kwargs + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + bos_token=bos_token, + eos_token=eos_token, + clean_text=clean_text, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + wordpieces_prefix=wordpieces_prefix, + **kwargs, + ) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
+ A Funnel Transformer sequence pair mask has the following format: + + :: + + 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] diff --git a/src/transformers/tokenization_gpt2.py b/src/transformers/tokenization_gpt2.py index deae5ea66f4..96557330a5e 100644 --- a/src/transformers/tokenization_gpt2.py +++ b/src/transformers/tokenization_gpt2.py @@ -19,12 +19,11 @@ import json import os import warnings from functools import lru_cache +from typing import Optional, Tuple import regex as re from .tokenization_utils import AddedToken, PreTrainedTokenizer -from .tokenization_utils_base import BatchEncoding -from .tokenization_utils_fast import PreTrainedTokenizerFast from .utils import logging @@ -256,22 +255,16 @@ class GPT2Tokenizer(PreTrainedTokenizer): text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) return text - def save_vocabulary(self, save_directory): - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) @@ -303,114 +296,3 @@ class GPT2Tokenizer(PreTrainedTokenizer): if is_split_into_words or add_prefix_space: text = " " + text return (text, kwargs) - - -class GPT2TokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level - Byte-Pair-Encoding. 
- - This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will - be encoded differently whether it is at the beginning of the sentence (without space) or not: - - :: - - >>> from transformers import GPT2TokenizerFast - >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") - >>> tokenizer("Hello world")['input_ids'] - [15496, 995] - >>> tokenizer(" Hello world")['input_ids'] - [18435, 995] - - You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you - call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. - - .. note:: - - When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with - ``add_prefix_space=True``. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - Path to the vocabulary file. - merges_file (:obj:`str`): - Path to the merges file. - errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode - `__ for more information. - unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): - The beginning of sequence token. - eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): - The end of sequence token. - add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to add an initial space to the input. This allows to treat the leading word just as any - other word. (GPT2 tokenizer detect beginning of words by the preceding space). - trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not the post-processing step should trim offsets to avoid including whitespaces. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["attention_mask"] - slow_tokenizer_class = GPT2Tokenizer - - def __init__( - self, - vocab_file, - merges_file, - unk_token="<|endoftext|>", - bos_token="<|endoftext|>", - eos_token="<|endoftext|>", - add_prefix_space=False, - **kwargs - ): - super().__init__( - vocab_file, - merges_file, - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - add_prefix_space=add_prefix_space, - **kwargs, - ) - self.add_prefix_space = add_prefix_space - - def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: - if "is_pretokenized" in kwargs: - warnings.warn( - "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.", - FutureWarning, - ) - is_split_into_words = kwargs.pop("is_pretokenized") - - is_split_into_words = kwargs.get("is_split_into_words", False) - assert self.add_prefix_space or not is_split_into_words, ( - f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " - "to use it with pretokenized inputs." 
- ) - - return super()._batch_encode_plus(*args, **kwargs) - - def _encode_plus(self, *args, **kwargs) -> BatchEncoding: - if "is_pretokenized" in kwargs: - warnings.warn( - "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.", - FutureWarning, - ) - is_split_into_words = kwargs.pop("is_pretokenized") - else: - is_split_into_words = kwargs.get("is_split_into_words", False) - - assert self.add_prefix_space or not is_split_into_words, ( - f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " - "to use it with pretokenized inputs." - ) - - return super()._encode_plus(*args, **kwargs) diff --git a/src/transformers/tokenization_gpt2_fast.py b/src/transformers/tokenization_gpt2_fast.py new file mode 100644 index 00000000000..cb4a6dfe308 --- /dev/null +++ b/src/transformers/tokenization_gpt2_fast.py @@ -0,0 +1,188 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" + + +import json +import warnings +from typing import Optional, Tuple + +from tokenizers import pre_tokenizers + +from .tokenization_gpt2 import GPT2Tokenizer +from .tokenization_utils_base import BatchEncoding +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json", + "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json", + }, + "merges_file": { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt", + "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt", + }, + "tokenizer_file": { + "gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tokenizer.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tokenizer.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tokenizer.json", + "gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-tokenizer.json", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tokenizer.json", + }, +} + 
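The ``GPT2TokenizerFast`` re-created below keeps the ``add_prefix_space`` constraint visible in the code removed above: pre-tokenized input is only accepted when the tokenizer was instantiated with ``add_prefix_space=True``. A minimal sketch of that behaviour, not part of the patch, using the ``gpt2`` checkpoint and the token ids quoted in the class docstring::

    from transformers import GPT2TokenizerFast

    # Plain text works with the default add_prefix_space=False.
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    print(tokenizer("Hello world")["input_ids"])  # [15496, 995]

    # Pre-tokenized (already split) input trips the assertion in _encode_plus /
    # _batch_encode_plus unless the tokenizer was created with add_prefix_space=True.
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)
    print(tokenizer(["Hello", "world"], is_split_into_words=True)["input_ids"])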
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "gpt2": 1024, + "gpt2-medium": 1024, + "gpt2-large": 1024, + "gpt2-xl": 1024, + "distilgpt2": 1024, +} + + +class GPT2TokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level + Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + :: + + >>> from transformers import GPT2TokenizerFast + >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") + >>> tokenizer("Hello world")['input_ids'] + [15496, 995] + >>> tokenizer(" Hello world")['input_ids'] + [18435, 995] + + You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + .. note:: + + When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with + ``add_prefix_space=True``. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode + `__ for more information. + unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): + The beginning of sequence token. + eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): + The end of sequence token. + add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (GPT2 tokenizer detect beginning of words by the preceding space). + trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the post-processing step should trim offsets to avoid including whitespaces. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] + slow_tokenizer_class = GPT2Tokenizer + + def __init__( + self, + vocab_file, + merges_file, + tokenizer_file=None, + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + add_prefix_space=False, + **kwargs + ): + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) + if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) + pre_tok_state["add_prefix_space"] = add_prefix_space + self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) + + self.add_prefix_space = add_prefix_space + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + if "is_pretokenized" in kwargs: + warnings.warn( + "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.", + FutureWarning, + ) + is_split_into_words = kwargs.pop("is_pretokenized") + + is_split_into_words = kwargs.get("is_split_into_words", False) + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + if "is_pretokenized" in kwargs: + warnings.warn( + "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.", + FutureWarning, + ) + is_split_into_words = kwargs.pop("is_pretokenized") + else: + is_split_into_words = kwargs.get("is_split_into_words", False) + + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) diff --git a/src/transformers/tokenization_herbert.py b/src/transformers/tokenization_herbert.py index 8104485f0f9..09ba80665f9 100644 --- a/src/transformers/tokenization_herbert.py +++ b/src/transformers/tokenization_herbert.py @@ -13,10 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Optional - from .tokenization_bert import BasicTokenizer -from .tokenization_utils_fast import PreTrainedTokenizerFast from .tokenization_xlm import XLMTokenizer from .utils import logging @@ -28,6 +25,14 @@ VOCAB_FILES_NAMES = { "merges_file": "merges.txt", } +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/vocab.json"}, + "merges_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/merges.txt"}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514} +PRETRAINED_INIT_CONFIGURATION = {} + class HerbertTokenizer(XLMTokenizer): """ @@ -45,6 +50,9 @@ class HerbertTokenizer(XLMTokenizer): """ vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, **kwargs): @@ -71,127 +79,3 @@ class HerbertTokenizer(XLMTokenizer): split_tokens.extend([t for t in self.bpe(token).split(" ")]) return split_tokens - - -class HerbertTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's `tokenizers` library). - - Peculiarities: - - - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. - Each occurence of a punctuation character will be treated separately. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users - should refer to the superclass for more information regarding methods. - - Args: - vocab_file (:obj:`str`): - Path to the vocabulary file. - merges_file (:obj:`str`): - Path to the merges file. - """ - - vocab_files_names = VOCAB_FILES_NAMES - slow_tokenizer_class = HerbertTokenizer - - def __init__(self, vocab_file, merges_file, **kwargs): - - kwargs["cls_token"] = "" - kwargs["unk_token"] = "" - kwargs["pad_token"] = "" - kwargs["mask_token"] = "" - kwargs["sep_token"] = "" - - super().__init__( - vocab_file, - merges_file, - **kwargs, - ) - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks - by concatenating and adding special tokens. - An HerBERT, like BERT sequence has the following format: - - - single sequence: `` X `` - - pair of sequences: `` A B `` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - - cls = [self.cls_token_id] - sep = [self.sep_token_id] - if token_ids_1 is None: - return cls + token_ids_0 + sep - - return cls + token_ids_0 + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. 
- token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. - HerBERT, like BERT sequence pair mask has the following format: - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given - sequence(s). - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] diff --git a/src/transformers/tokenization_herbert_fast.py b/src/transformers/tokenization_herbert_fast.py new file mode 100644 index 00000000000..299d876b1e2 --- /dev/null +++ b/src/transformers/tokenization_herbert_fast.py @@ -0,0 +1,165 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple + +from .tokenization_herbert import ( + PRETRAINED_INIT_CONFIGURATION, + PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES, + PRETRAINED_VOCAB_FILES_MAP, + HerbertTokenizer, +) +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +class HerbertTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's `tokenizers` library). + + Peculiarities: + + - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. + Each occurence of a punctuation character will be treated separately. 
+ + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = HerbertTokenizer + + def __init__(self, vocab_file, merges_file, tokenizer_file=None, **kwargs): + + kwargs["cls_token"] = "" + kwargs["unk_token"] = "" + kwargs["pad_token"] = "" + kwargs["mask_token"] = "" + kwargs["sep_token"] = "" + + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + **kwargs, + ) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + An HerBERT, like BERT sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + + cls = [self.cls_token_id] + sep = [self.sep_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + HerBERT, like BERT sequence pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. 
+ token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) diff --git a/src/transformers/tokenization_layoutlm.py b/src/transformers/tokenization_layoutlm.py index f3a22b5d97d..0a3c3e7bffa 100644 --- a/src/transformers/tokenization_layoutlm.py +++ b/src/transformers/tokenization_layoutlm.py @@ -15,7 +15,7 @@ """ Tokenization class for model LayoutLM.""" -from .tokenization_bert import BertTokenizer, BertTokenizerFast +from .tokenization_bert import BertTokenizer from .utils import logging @@ -58,21 +58,3 @@ class LayoutLMTokenizer(BertTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - - -class LayoutLMTokenizerFast(BertTokenizerFast): - r""" - Constructs a "Fast" LayoutLMTokenizer. - - :class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end - tokenization: punctuation splitting + wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - model_input_names = ["attention_mask"] diff --git a/src/transformers/tokenization_layoutlm_fast.py b/src/transformers/tokenization_layoutlm_fast.py new file mode 100644 index 00000000000..53537103761 --- /dev/null +++ b/src/transformers/tokenization_layoutlm_fast.py @@ -0,0 +1,66 @@ +# coding=utf-8 +# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
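Before the LayoutLM changes below, an illustrative sketch of the three special-token helpers just re-implemented for the fast HerBERT tokenizer (not part of the patch; assumes ``HerbertTokenizerFast`` is exported at the package root and the ``allegro/herbert-base-cased`` files are reachable):

from transformers import HerbertTokenizerFast

tok = HerbertTokenizerFast.from_pretrained("allegro/herbert-base-cased")

ids_a = tok.convert_tokens_to_ids(tok.tokenize("Pierwsze zdanie."))
ids_b = tok.convert_tokens_to_ids(tok.tokenize("Drugie zdanie."))

# Single sequence: cls + A + sep; pair: cls + A + sep + B + sep,
# exactly as build_inputs_with_special_tokens assembles them above.
single = tok.build_inputs_with_special_tokens(ids_a)
pair = tok.build_inputs_with_special_tokens(ids_a, ids_b)

# 1 marks the inserted special tokens, 0 the ordinary tokens.
print(tok.get_special_tokens_mask(ids_a, ids_b))

# token_type_ids: 0 for the first segment (and its separators), 1 for the second.
print(tok.create_token_type_ids_from_sequences(ids_a, ids_b))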
+""" Tokenization class for model LayoutLM.""" + + +from .tokenization_bert_fast import BertTokenizerFast +from .tokenization_layoutlm import LayoutLMTokenizer +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "microsoft/layoutlm-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + "microsoft/layoutlm-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + }, + "tokenizer_file": { + "microsoft/layoutlm-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json", + "microsoft/layoutlm-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tokenizer.json", + }, +} + + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "microsoft/layoutlm-base-uncased": 512, + "microsoft/layoutlm-large-uncased": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "microsoft/layoutlm-base-uncased": {"do_lower_case": True}, + "microsoft/layoutlm-large-uncased": {"do_lower_case": True}, +} + + +class LayoutLMTokenizerFast(BertTokenizerFast): + r""" + Constructs a "Fast" LayoutLMTokenizer. + + :class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end + tokenization: punctuation splitting + wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = LayoutLMTokenizer diff --git a/src/transformers/tokenization_longformer.py b/src/transformers/tokenization_longformer.py index 5c2718e0da3..f6157e472e1 100644 --- a/src/transformers/tokenization_longformer.py +++ b/src/transformers/tokenization_longformer.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast +from .tokenization_roberta import RobertaTokenizer from .utils import logging @@ -54,19 +54,3 @@ class LongformerTokenizer(RobertaTokenizer): "vocab_file": {m: vocab_url for m in _all_longformer_models}, "merges_file": {m: merges_url for m in _all_longformer_models}, } - - -class LongformerTokenizerFast(RobertaTokenizerFast): - r""" - Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer - to the superclass for usage examples and documentation concerning parameters. 
- """ - # merges and vocab same as Roberta - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_vocab_files_map = { - "vocab_file": {m: vocab_url for m in _all_longformer_models}, - "merges_file": {m: merges_url for m in _all_longformer_models}, - } - slow_tokenizer_class = LongformerTokenizer diff --git a/src/transformers/tokenization_longformer_fast.py b/src/transformers/tokenization_longformer_fast.py new file mode 100644 index 00000000000..8e4dff49ed5 --- /dev/null +++ b/src/transformers/tokenization_longformer_fast.py @@ -0,0 +1,60 @@ +# coding=utf-8 +# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .tokenization_longformer import LongformerTokenizer +from .tokenization_roberta_fast import RobertaTokenizerFast +from .utils import logging + + +logger = logging.get_logger(__name__) + + +# vocab and merges same as roberta +vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" +merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" +tokenizer_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tokenizer.json" +_all_longformer_models = [ + "allenai/longformer-base-4096", + "allenai/longformer-large-4096", + "allenai/longformer-large-4096-finetuned-triviaqa", + "allenai/longformer-base-4096-extra.pos.embd.only", + "allenai/longformer-large-4096-extra.pos.embd.only", +] + + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "allenai/longformer-base-4096": 4096, + "allenai/longformer-large-4096": 4096, + "allenai/longformer-large-4096-finetuned-triviaqa": 4096, + "allenai/longformer-base-4096-extra.pos.embd.only": 4096, + "allenai/longformer-large-4096-extra.pos.embd.only": 4096, +} + + +class LongformerTokenizerFast(RobertaTokenizerFast): + r""" + Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer + to the superclass for usage examples and documentation concerning parameters. + """ + # merges and vocab same as Roberta + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_vocab_files_map = { + "vocab_file": {m: vocab_url for m in _all_longformer_models}, + "merges_file": {m: merges_url for m in _all_longformer_models}, + "tokenizer_file": {m: tokenizer_url for m in _all_longformer_models}, + } + slow_tokenizer_class = LongformerTokenizer diff --git a/src/transformers/tokenization_lxmert.py b/src/transformers/tokenization_lxmert.py index 163684d9e90..b8bf3478b6a 100644 --- a/src/transformers/tokenization_lxmert.py +++ b/src/transformers/tokenization_lxmert.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .tokenization_bert import BertTokenizer, BertTokenizerFast +from .tokenization_bert import BertTokenizer #################################################### @@ -63,20 +63,3 @@ class LxmertTokenizer(BertTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - - -class LxmertTokenizerFast(BertTokenizerFast): - r""" - Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs - end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. - """ - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - slow_tokenizer_class = LxmertTokenizer diff --git a/src/transformers/tokenization_lxmert_fast.py b/src/transformers/tokenization_lxmert_fast.py new file mode 100644 index 00000000000..e9048414f80 --- /dev/null +++ b/src/transformers/tokenization_lxmert_fast.py @@ -0,0 +1,69 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .tokenization_bert_fast import BertTokenizerFast +from .tokenization_lxmert import LxmertTokenizer + + +#################################################### +# Mapping from the keyword arguments names of Tokenizer `__init__` +# to file names for serializing Tokenizer instances +#################################################### +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + +#################################################### +# Mapping from the keyword arguments names of Tokenizer `__init__` +# to pretrained vocabulary URL for all the model shortcut names. +#################################################### +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "unc-nlp/lxmert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + }, + "tokenizer_file": { + "unc-nlp/lxmert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json", + }, +} + +#################################################### +# Mapping from model shortcut names to max length of inputs +#################################################### +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "unc-nlp/lxmert-base-uncased": 512, +} +#################################################### +# Mapping from model shortcut names to a dictionary of additional +# keyword arguments for Tokenizer `__init__`. +# To be used for checkpoint specific configurations. 
+#################################################### +PRETRAINED_INIT_CONFIGURATION = { + "unc-nlp/lxmert-base-uncased": {"do_lower_case": True}, +} + + +class LxmertTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = LxmertTokenizer diff --git a/src/transformers/tokenization_marian.py b/src/transformers/tokenization_marian.py index c8b22695cae..0db025898b0 100644 --- a/src/transformers/tokenization_marian.py +++ b/src/transformers/tokenization_marian.py @@ -18,6 +18,19 @@ vocab_files_names = { "vocab": "vocab.json", "tokenizer_config_file": "tokenizer_config.json", } + +PRETRAINED_VOCAB_FILES_MAP = { + "source_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/source.spm"}, + "target_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/target.spm"}, + "vocab": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/vocab.json"}, + "tokenizer_config_file": { + "Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/tokenizer_config.json" + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"Helsinki-NLP/opus-mt-en-de": 512} +PRETRAINED_INIT_CONFIGURATION = {} + # Example URL https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/vocab.json @@ -63,6 +76,9 @@ class MarianTokenizer(PreTrainedTokenizer): """ vocab_files_names = vocab_files_names + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["attention_mask"] language_code_re = re.compile(">>.+<<") # type: re.Pattern @@ -189,27 +205,22 @@ class MarianTokenizer(PreTrainedTokenizer): def vocab_size(self) -> int: return len(self.encoder) - def save_vocabulary(self, save_directory: str) -> Tuple[str]: - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. 
- """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: save_dir = Path(save_directory) assert save_dir.is_dir(), f"{save_directory} should be a directory" - save_json(self.encoder, save_dir / self.vocab_files_names["vocab"]) + save_json( + self.encoder, + save_dir / ((filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab"]), + ) for orig, f in zip(["source.spm", "target.spm"], self.spm_files): - dest_path = save_dir / Path(f).name + dest_path = save_dir / ((filename_prefix + "-" if filename_prefix else "") + Path(f).name) if not dest_path.exists(): copyfile(f, save_dir / orig) - return tuple(save_dir / f for f in self.vocab_files_names) + return tuple( + save_dir / ((filename_prefix + "-" if filename_prefix else "") + f) for f in self.vocab_files_names + ) def get_vocab(self) -> Dict: vocab = self.encoder.copy() diff --git a/src/transformers/tokenization_mbart.py b/src/transformers/tokenization_mbart.py index 2f13279fe24..916f95c9aa6 100644 --- a/src/transformers/tokenization_mbart.py +++ b/src/transformers/tokenization_mbart.py @@ -15,12 +15,10 @@ from typing import List, Optional -from tokenizers import processors - from .file_utils import add_start_docstrings from .tokenization_utils import BatchEncoding from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING -from .tokenization_xlm_roberta import XLMRobertaTokenizer, XLMRobertaTokenizerFast +from .tokenization_xlm_roberta import XLMRobertaTokenizer from .utils import logging @@ -95,8 +93,8 @@ class MBartTokenizer(XLMRobertaTokenizer): prefix_tokens: List[int] = [] suffix_tokens: List[int] = [] - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, *args, tokenizer_file=None, **kwargs): + super().__init__(*args, tokenizer_file=tokenizer_file, **kwargs) self.sp_model_size = len(self.sp_model) self.lang_code_to_id = { @@ -233,185 +231,3 @@ class MBartTokenizer(XLMRobertaTokenizer): self.cur_lang_code = self.lang_code_to_id[lang] self.prefix_tokens = [] self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] - - -class MBartTokenizerFast(XLMRobertaTokenizerFast): - """ - Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds - a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`. - - Refer to superclass :class:`~transformers.XLMRobertaTokenizerFast` for usage examples and documentation concerning - the initialization parameters and other methods. - - .. warning:: - ``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work - properly. - - The tokenization method is `` `` for source language documents, and - `` ``` for target language documents. - - Examples:: - - >>> from transformers import MBartTokenizerFast - >>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro') - >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" - >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluÅ£ie militară în Siria" - >>> batch: dict = tokenizer.prepare_seq2seq_batch( - ... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian - ... 
) - """ - - vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} - max_model_input_sizes = {m: 1024 for m in _all_mbart_models} - pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} - slow_tokenizer_class = MBartTokenizer - - prefix_tokens: List[int] = [] - suffix_tokens: List[int] = [] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.cur_lang_code = self.convert_tokens_to_ids("en_XX") - self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX")) - - self.add_special_tokens({"additional_special_tokens": FAIRSEQ_LANGUAGE_CODES}) - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of ids. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks - by concatenating and adding special tokens. The special tokens depend on calling set_lang. - - An MBART sequence has the following format, where ``X`` represents the sequence: - - - ``input_ids`` (for encoder) ``X [eos, src_lang_code]`` - - ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]`` - - BOS is never used. - Pairs of sequences are not the expected use case, but they will be handled without a separator. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
- """ - if token_ids_1 is None: - return self.prefix_tokens + token_ids_0 + self.suffix_tokens - # We don't expect to process pairs, but leave the pair logic for API consistency - return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens - - @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) - def prepare_seq2seq_batch( - self, - src_texts: List[str], - src_lang: str = "en_XX", - tgt_texts: Optional[List[str]] = None, - tgt_lang: str = "ro_RO", - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - truncation: bool = True, - padding: str = "longest", - return_tensors: str = "pt", - **kwargs, - ) -> BatchEncoding: - if max_length is None: - max_length = self.max_len - self.set_src_lang_special_tokens(src_lang) - model_inputs: BatchEncoding = self( - src_texts, - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - padding=padding, - truncation=truncation, - **kwargs, - ) - if tgt_texts is None: - return model_inputs - # Process tgt_texts - if max_target_length is None: - max_target_length = max_length - self.set_tgt_lang_special_tokens(tgt_lang) - - labels = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=True, - **kwargs, - )["input_ids"] - model_inputs["labels"] = labels - self.set_src_lang_special_tokens(src_lang) # sets to src_lang - return model_inputs - - def set_src_lang_special_tokens(self, src_lang) -> None: - """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code].""" - self.cur_lang_code = self.convert_tokens_to_ids(src_lang) - self.prefix_tokens = [] - self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] - - prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) - suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) - - self._tokenizer.post_processor = processors.TemplateProcessing( - single=prefix_tokens_str + ["$A"] + suffix_tokens_str, - pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, - special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), - ) - - def set_tgt_lang_special_tokens(self, lang: str) -> None: - """Reset the special tokens to the target language setting. Prefix [tgt_lang_code], suffix =[eos].""" - self.cur_lang_code = self.convert_tokens_to_ids(lang) - self.prefix_tokens = [] - self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] - - prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) - suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) - - self._tokenizer.post_processor = processors.TemplateProcessing( - single=prefix_tokens_str + ["$A"] + suffix_tokens_str, - pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, - special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), - ) diff --git a/src/transformers/tokenization_mbart_fast.py b/src/transformers/tokenization_mbart_fast.py new file mode 100644 index 00000000000..5ed2cbd7efc --- /dev/null +++ b/src/transformers/tokenization_mbart_fast.py @@ -0,0 +1,247 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +from tokenizers import processors + +from .file_utils import add_start_docstrings, is_sentencepiece_available +from .tokenization_utils import BatchEncoding +from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING +from .tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast +from .utils import logging + + +if is_sentencepiece_available(): + from .tokenization_mbart import MBartTokenizer +else: + MBartTokenizer = None + + +logger = logging.get_logger(__name__) + +_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"] +SPM_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/sentence.bpe.model" +tokenizer_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/tokenizer.json" + +FAIRSEQ_LANGUAGE_CODES = [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", +] + + +class MBartTokenizerFast(XLMRobertaTokenizerFast): + """ + Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds + a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`. + + Refer to superclass :class:`~transformers.XLMRobertaTokenizerFast` for usage examples and documentation concerning + the initialization parameters and other methods. + + .. warning:: + ``prepare_seq2seq_batch`` should be used to encode inputs. Other tokenizer methods like ``encode`` do not work + properly. + + The tokenization method is `` `` for source language documents, and + `` ``` for target language documents. + + Examples:: + + >>> from transformers import MBartTokenizerFast + >>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro') + >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria" + >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluÅ£ie militară în Siria" + >>> batch: dict = tokenizer.prepare_seq2seq_batch( + ... example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian + ... 
) + """ + + vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} + max_model_input_sizes = {m: 1024 for m in _all_mbart_models} + pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} + slow_tokenizer_class = MBartTokenizer + + prefix_tokens: List[int] = [] + suffix_tokens: List[int] = [] + + def __init__(self, *args, tokenizer_file=None, **kwargs): + super().__init__(*args, tokenizer_file=tokenizer_file, **kwargs) + + self.cur_lang_code = self.convert_tokens_to_ids("en_XX") + self.set_src_lang_special_tokens(kwargs.get("src_lang", "en_XX")) + + self.add_special_tokens({"additional_special_tokens": FAIRSEQ_LANGUAGE_CODES}) + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. The special tokens depend on calling set_lang. + + An MBART sequence has the following format, where ``X`` represents the sequence: + + - ``input_ids`` (for encoder) ``X [eos, src_lang_code]`` + - ``decoder_input_ids``: (for decoder) ``[tgt_lang_code] X [eos]`` + + BOS is never used. + Pairs of sequences are not the expected use case, but they will be handled without a separator. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
+ """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) + def prepare_seq2seq_batch( + self, + src_texts: List[str], + src_lang: str = "en_XX", + tgt_texts: Optional[List[str]] = None, + tgt_lang: str = "ro_RO", + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + truncation: bool = True, + padding: str = "longest", + return_tensors: str = "pt", + **kwargs, + ) -> BatchEncoding: + if max_length is None: + max_length = self.max_len + self.set_src_lang_special_tokens(src_lang) + model_inputs: BatchEncoding = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=truncation, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + self.set_tgt_lang_special_tokens(tgt_lang) + + labels = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=True, + **kwargs, + )["input_ids"] + model_inputs["labels"] = labels + self.set_src_lang_special_tokens(src_lang) # sets to src_lang + return model_inputs + + def set_src_lang_special_tokens(self, src_lang) -> None: + """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code].""" + self.cur_lang_code = self.convert_tokens_to_ids(src_lang) + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] + + prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) + suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) + + self._tokenizer.post_processor = processors.TemplateProcessing( + single=prefix_tokens_str + ["$A"] + suffix_tokens_str, + pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, + special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), + ) + + def set_tgt_lang_special_tokens(self, lang: str) -> None: + """Reset the special tokens to the target language setting. Prefix [tgt_lang_code], suffix =[eos].""" + self.cur_lang_code = self.convert_tokens_to_ids(lang) + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code] + + prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens) + suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens) + + self._tokenizer.post_processor = processors.TemplateProcessing( + single=prefix_tokens_str + ["$A"] + suffix_tokens_str, + pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str, + special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)), + ) diff --git a/src/transformers/tokenization_mobilebert.py b/src/transformers/tokenization_mobilebert.py index 44874b8c234..90cc6f45b04 100644 --- a/src/transformers/tokenization_mobilebert.py +++ b/src/transformers/tokenization_mobilebert.py @@ -13,7 +13,7 @@ # limitations under the License. 
"""Tokenization classes for MobileBERT.""" -from .tokenization_bert import BertTokenizer, BertTokenizerFast +from .tokenization_bert import BertTokenizer from .utils import logging @@ -27,7 +27,7 @@ PRETRAINED_VOCAB_FILES_MAP = { } } -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512} PRETRAINED_INIT_CONFIGURATION = {} @@ -48,21 +48,3 @@ class MobileBertTokenizer(BertTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - - -class MobileBertTokenizerFast(BertTokenizerFast): - r""" - Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs - end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - slow_tokenizer_class = MobileBertTokenizer diff --git a/src/transformers/tokenization_mobilebert_fast.py b/src/transformers/tokenization_mobilebert_fast.py new file mode 100644 index 00000000000..63dca583684 --- /dev/null +++ b/src/transformers/tokenization_mobilebert_fast.py @@ -0,0 +1,55 @@ +# coding=utf-8 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for MobileBERT.""" + +from .tokenization_bert_fast import BertTokenizerFast +from .tokenization_mobilebert import MobileBertTokenizer +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "mobilebert-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/google/mobilebert-uncased/vocab.txt" + }, + "tokenizer_file": { + "mobilebert-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/google/mobilebert-uncased/tokenizer.json" + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512} + + +PRETRAINED_INIT_CONFIGURATION = {} + + +class MobileBertTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = MobileBertTokenizer diff --git a/src/transformers/tokenization_openai.py b/src/transformers/tokenization_openai.py index d03ecfb3d00..b3ebc1a18c2 100644 --- a/src/transformers/tokenization_openai.py +++ b/src/transformers/tokenization_openai.py @@ -18,10 +18,10 @@ import json import os import re +from typing import Optional, Tuple from .tokenization_bert import BasicTokenizer from .tokenization_utils import PreTrainedTokenizer -from .tokenization_utils_fast import PreTrainedTokenizerFast from .utils import logging @@ -204,22 +204,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): out_string = "".join(tokens).replace("", " ").strip() return out_string - def save_vocabulary(self, save_directory): - """ - Save the vocabulary and special tokens file to a directory. - - Args: - vocab_path (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) @@ -238,38 +232,3 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): index += 1 return vocab_file, merge_file - - -class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with - the following peculiarities: - - - lower case all inputs - - uses BERT's BasicTokenizer for pre-BPE tokenization - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - Path to the vocabulary file. - merges_file (:obj:`str`): - Path to the merges file. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["attention_mask"] - slow_tokenizer_class = OpenAIGPTTokenizer - - def __init__(self, vocab_file, merges_file, unk_token="", **kwargs): - super().__init__(vocab_file, merges_file, unk_token=unk_token, **kwargs) - - @property - def do_lower_case(self): - return True diff --git a/src/transformers/tokenization_openai_fast.py b/src/transformers/tokenization_openai_fast.py new file mode 100644 index 00000000000..8b18b3d1d33 --- /dev/null +++ b/src/transformers/tokenization_openai_fast.py @@ -0,0 +1,76 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization classes for OpenAI GPT.""" + + +from typing import Optional, Tuple + +from .tokenization_openai import OpenAIGPTTokenizer +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"}, + "merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"}, + "tokenizer_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tokenizer.json"}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "openai-gpt": 512, +} + + +class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with + the following peculiarities: + + - lower case all inputs + - uses BERT's BasicTokenizer for pre-BPE tokenization + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] + slow_tokenizer_class = OpenAIGPTTokenizer + + def __init__(self, vocab_file, merges_file, tokenizer_file=None, unk_token="", **kwargs): + super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs) + + @property + def do_lower_case(self): + return True + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) diff --git a/src/transformers/tokenization_pegasus.py b/src/transformers/tokenization_pegasus.py index ba92a068d4b..f261149ecd0 100644 --- a/src/transformers/tokenization_pegasus.py +++ b/src/transformers/tokenization_pegasus.py @@ -15,7 +15,7 @@ from typing import Dict, List, Optional from .file_utils import add_start_docstrings -from .tokenization_reformer import ReformerTokenizer, ReformerTokenizerFast +from .tokenization_reformer import ReformerTokenizer from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding @@ -43,7 +43,7 @@ class PegasusTokenizer(ReformerTokenizer): the initialization parameters and other methods. """ offset = 103 # entries 2-104 are only used for pretraining - vocab_files_names = {"vocab_file": "spiece.model"} + vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES @@ -157,78 +157,3 @@ class PegasusTokenizer(ReformerTokenizer): labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"] model_inputs["labels"] = labels return model_inputs - - -class PegasusTokenizerFast(ReformerTokenizerFast): - offset = 103 # entries 2-104 are only used for pretraining - vocab_files_names = {"vocab_file": "spiece.model"} - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - slow_tokenizer_class = PegasusTokenizer - - def _special_token_mask(self, seq): - all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp - all_special_ids.remove(self.unk_token_id) # is only sometimes special - assert all_special_ids == set([0, 1]) - return [1 if x in all_special_ids else 0 for x in seq] - - def get_special_tokens_mask( - self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" - if already_has_special_tokens: - return self._special_token_mask(token_ids_0) - elif token_ids_1 is None: - return self._special_token_mask(token_ids_0) + [1] - else: - return self._special_token_mask(token_ids_0 + token_ids_1) + [1] - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: - """ - Build model inputs from a sequence by adding eos to the end. no bos token is added to the front. - - single sequence: ``X `` - - pair of sequences: ``A B `` (not intended use) - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
- """ - if token_ids_1 is None: - return token_ids_0 + [self.eos_token_id] - # We don't expect to process pairs, but leave the pair logic for API consistency - return token_ids_0 + token_ids_1 + [self.eos_token_id] - - @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) - def prepare_seq2seq_batch( - self, - src_texts: List[str], - tgt_texts: Optional[List[str]] = None, - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - return_tensors: str = "pt", - truncation=True, - padding="longest", - **unused, - ) -> BatchEncoding: - if "" in src_texts: - raise ValueError(f"found empty string in src_texts: {src_texts}") - tokenizer_kwargs = dict( - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - truncation=truncation, - padding=padding, - ) - model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs) - if tgt_texts is None: - return model_inputs - if max_target_length is not None: - tokenizer_kwargs["max_length"] = max_target_length - labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"] - model_inputs["labels"] = labels - return model_inputs diff --git a/src/transformers/tokenization_pegasus_fast.py b/src/transformers/tokenization_pegasus_fast.py new file mode 100644 index 00000000000..69abf9015fd --- /dev/null +++ b/src/transformers/tokenization_pegasus_fast.py @@ -0,0 +1,118 @@ +# coding=utf-8 +# Copyright 2020 Google and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import List, Optional + +from .file_utils import add_start_docstrings, is_sentencepiece_available +from .tokenization_reformer_fast import ReformerTokenizerFast +from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding + + +if is_sentencepiece_available(): + from .tokenization_pegasus import PegasusTokenizer +else: + PegasusTokenizer = None + + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/spiece.model"}, + "tokenizer_file": {"google/pegasus-xsum": "https://cdn.huggingface.co/google/pegasus-xsum/tokenizer.json"}, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "google/pegasus-xsum": 512, +} + + +class PegasusTokenizerFast(ReformerTokenizerFast): + offset = 103 # entries 2-104 are only used for pretraining + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + slow_tokenizer_class = PegasusTokenizer + + # def num_special_tokens_to_add(self, pair=False): + # """Just EOS""" + # return 1 + + def _special_token_mask(self, seq): + all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp + all_special_ids.remove(self.unk_token_id) # is only sometimes special + assert all_special_ids == set([0, 1]) + return [1 if x in all_special_ids else 0 for x in seq] + + def get_special_tokens_mask( + self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """Get list where entries are [1] if a token is [eos] or [pad] else 0.""" + if already_has_special_tokens: + return self._special_token_mask(token_ids_0) + elif token_ids_1 is None: + return self._special_token_mask(token_ids_0) + [1] + else: + return self._special_token_mask(token_ids_0 + token_ids_1) + [1] + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: + """ + Build model inputs from a sequence by adding eos to the end. no bos token is added to the front. + - single sequence: ``X `` + - pair of sequences: ``A B `` (not intended use) + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
+ """ + if token_ids_1 is None: + return token_ids_0 + [self.eos_token_id] + # We don't expect to process pairs, but leave the pair logic for API consistency + return token_ids_0 + token_ids_1 + [self.eos_token_id] + + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + return_tensors: str = "pt", + truncation=True, + padding="longest", + **unused, + ) -> BatchEncoding: + if "" in src_texts: + raise ValueError(f"found empty string in src_texts: {src_texts}") + tokenizer_kwargs = dict( + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + truncation=truncation, + padding=padding, + ) + model_inputs: BatchEncoding = self(src_texts, **tokenizer_kwargs) + if tgt_texts is None: + return model_inputs + if max_target_length is not None: + tokenizer_kwargs["max_length"] = max_target_length + labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"] + model_inputs["labels"] = labels + return model_inputs diff --git a/src/transformers/tokenization_phobert.py b/src/transformers/tokenization_phobert.py index b09fbd1ba36..7b7418bcfb6 100644 --- a/src/transformers/tokenization_phobert.py +++ b/src/transformers/tokenization_phobert.py @@ -19,7 +19,7 @@ import os import re from shutil import copyfile -from typing import List, Optional +from typing import List, Optional, Tuple from .tokenization_utils import PreTrainedTokenizer from .utils import logging @@ -311,22 +311,16 @@ class PhobertTokenizer(PreTrainedTokenizer): out_string = " ".join(tokens).replace("@@ ", "").strip() return out_string - def save_vocabulary(self, save_directory): - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. 
- """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - out_merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + out_merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/src/transformers/tokenization_reformer.py b/src/transformers/tokenization_reformer.py index a2b2e78568d..2c2abf87dca 100644 --- a/src/transformers/tokenization_reformer.py +++ b/src/transformers/tokenization_reformer.py @@ -17,10 +17,11 @@ import os from shutil import copyfile -from typing import Dict +from typing import Dict, Optional, Tuple + +import sentencepiece as spm from .tokenization_utils import PreTrainedTokenizer -from .tokenization_utils_fast import PreTrainedTokenizerFast from .utils import logging @@ -102,16 +103,6 @@ class ReformerTokenizer(PreTrainedTokenizer): **kwargs, ) - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use ReformerTokenizer:" - "https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise - self.vocab_file = vocab_file self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @@ -132,14 +123,6 @@ class ReformerTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use ReformerTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) @@ -166,91 +149,14 @@ class ReformerTokenizer(PreTrainedTokenizer): out_string = self.sp_model.decode_pieces(tokens) return out_string - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - -class ReformerTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece - `__ . - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. 
- - Args: - vocab_file (:obj:`str`): - `SentencePiece `__ file (generally has a `.spm` extension) that - contains the vocabulary necessary to instantiate a tokenizer. - eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The end of sequence token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the end - of sequence. The token used is the :obj:`sep_token`. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for padding, for example when batching sequences of different lengths. - additional_special_tokens (:obj:`List[str]`, `optional`): - Additional special tokens used by the tokenizer. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["attention_mask"] - slow_tokenizer_class = ReformerTokenizer - - def __init__( - self, - vocab_file, - eos_token="", - unk_token="", - pad_token="", - additional_special_tokens=[], - **kwargs - ): - super().__init__( - vocab_file, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - additional_special_tokens=additional_special_tokens, - **kwargs, + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] ) - self.vocab_file = vocab_file - - def save_vocabulary(self, save_directory): - """Save the sentencepiece vocabulary (copy original file) and special tokens file - to a directory. - """ - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/src/transformers/tokenization_reformer_fast.py b/src/transformers/tokenization_reformer_fast.py new file mode 100644 index 00000000000..0a6beecbc78 --- /dev/null +++ b/src/transformers/tokenization_reformer_fast.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2020 The Trax Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
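With the in-method ``try: import sentencepiece`` fallbacks removed above, the slow tokenizer modules now import ``sentencepiece`` unconditionally, and the new ``*_fast`` modules guard their slow counterpart behind ``is_sentencepiece_available()``. A small sketch of the same guard from user code, assuming this branch of the library is installed:

from transformers.file_utils import is_sentencepiece_available

# The fast tokenizer only needs the `tokenizers` backend.
from transformers import ReformerTokenizerFast

if is_sentencepiece_available():
    # The slow tokenizer requires the sentencepiece package.
    from transformers import ReformerTokenizer
else:
    ReformerTokenizer = None  # same fallback used by the new *_fast modules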
+""" Tokenization class for model Reformer.""" + + +import os +from shutil import copyfile +from typing import Optional, Tuple + +from .file_utils import is_sentencepiece_available +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .utils import logging + + +if is_sentencepiece_available(): + from .tokenization_reformer import ReformerTokenizer +else: + ReformerTokenizer = None + + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = "▁" + + +#################################################### +# Mapping from the keyword arguments names of Tokenizer `__init__` +# to file names for serializing Tokenizer instances +#################################################### +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + +#################################################### +# Mapping from the keyword arguments names of Tokenizer `__init__` +# to pretrained vocabulary URL for all the model shortcut names. +#################################################### +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/spiece.model" + }, + "tokenizer_file": { + "google/reformer-crime-and-punishment": "https://cdn.huggingface.co/google/reformer-crime-and-punishment/tokenizer.json" + }, +} + +#################################################### +# Mapping from model shortcut names to max length of inputs +#################################################### +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "google/reformer-crime-and-punishment": 524288, +} + + +class ReformerTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece + `__ . + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a `.spm` extension) that + contains the vocabulary necessary to instantiate a tokenizer. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end + of sequence. The token used is the :obj:`sep_token`. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + additional_special_tokens (:obj:`List[str]`, `optional`): + Additional special tokens used by the tokenizer. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] + slow_tokenizer_class = ReformerTokenizer + + def __init__( + self, + vocab_file, + tokenizer_file=None, + eos_token="", + unk_token="", + pad_token="", + additional_special_tokens=[], + **kwargs + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + self.vocab_file = vocab_file + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/tokenization_retribert.py b/src/transformers/tokenization_retribert.py index 58c3722d76a..9da6e2ad61c 100644 --- a/src/transformers/tokenization_retribert.py +++ b/src/transformers/tokenization_retribert.py @@ -14,7 +14,7 @@ # limitations under the License. """Tokenization classes for RetriBERT.""" -from .tokenization_bert import BertTokenizer, BertTokenizerFast +from .tokenization_bert import BertTokenizer from .utils import logging @@ -54,22 +54,3 @@ class RetriBertTokenizer(BertTokenizer): max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION model_input_names = ["attention_mask"] - - -class RetriBertTokenizerFast(BertTokenizerFast): - r""" - Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs - end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - slow_tokenizer_class = RetriBertTokenizer - model_input_names = ["attention_mask"] diff --git a/src/transformers/tokenization_retribert_fast.py b/src/transformers/tokenization_retribert_fast.py new file mode 100644 index 00000000000..1d9cf5feb6e --- /dev/null +++ b/src/transformers/tokenization_retribert_fast.py @@ -0,0 +1,61 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for RetriBERT.""" + +from .tokenization_bert_fast import BertTokenizerFast +from .tokenization_retribert import RetriBertTokenizer +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "yjernite/retribert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + }, + "tokenizer_file": { + "yjernite/retribert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "yjernite/retribert-base-uncased": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "yjernite/retribert-base-uncased": {"do_lower_case": True}, +} + + +class RetriBertTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = RetriBertTokenizer + model_input_names = ["attention_mask"] diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py index 4b009964144..79e585c7bed 100644 --- a/src/transformers/tokenization_roberta.py +++ b/src/transformers/tokenization_roberta.py @@ -17,7 +17,7 @@ import warnings from typing import List, Optional -from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast +from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_utils import AddedToken from .utils import logging @@ -263,143 +263,3 @@ class RobertaTokenizer(GPT2Tokenizer): if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): text = " " + text return (text, kwargs) - - -class RobertaTokenizerFast(GPT2TokenizerFast): - """ - Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2 - tokenizer, using byte-level Byte-Pair-Encoding. - - This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will - be encoded differently whether it is at the beginning of the sentence (without space) or not: - - :: - - >>> from transformers import RobertaTokenizerFast - >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base") - >>> tokenizer("Hello world")['input_ids'] - [0, 31414, 232, 328, 2] - >>> tokenizer(" Hello world")['input_ids'] - [0, 20920, 232, 2] - - You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you - call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. - - .. note:: - - When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with - ``add_prefix_space=True``. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. 
Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - Path to the vocabulary file. - merges_file (:obj:`str`): - Path to the merges file. - errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode - `__ for more information. - bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the beginning - of sequence. The token used is the :obj:`cls_token`. - eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The end of sequence token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the end - of sequence. The token used is the :obj:`sep_token`. - sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences - for sequence classification or for a text and a question for question answering. - It is also used as the last token of a sequence built with special tokens. - cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The classifier token which is used when doing sequence classification (classification of the whole - sequence instead of per-token classification). It is the first token of the sequence when built with - special tokens. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to add an initial space to the input. This allows to treat the leading word just as any - other word. (RoBERTa tokenizer detect beginning of words by the preceding space). - trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether the post processing step should trim offsets to avoid including whitespaces. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["attention_mask"] - slow_tokenizer_class = RobertaTokenizer - - def __init__( - self, - vocab_file, - merges_file, - errors="replace", - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - add_prefix_space=False, - **kwargs - ): - super().__init__( - vocab_file, - merges_file, - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - cls_token=cls_token, - unk_token=unk_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - **kwargs, - ) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] - if token_ids_1 is None: - return output - - return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. - RoBERTa does not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] diff --git a/src/transformers/tokenization_roberta_fast.py b/src/transformers/tokenization_roberta_fast.py new file mode 100644 index 00000000000..89bf2993689 --- /dev/null +++ b/src/transformers/tokenization_roberta_fast.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Fast Tokenization classes for RoBERTa.""" + +from typing import List, Optional + +from .tokenization_gpt2_fast import GPT2TokenizerFast +from .tokenization_roberta import RobertaTokenizer +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", + }, + "merges_file": { + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", + }, + "tokenizer_file": { + "roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tokenizer.json", + "roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tokenizer.json", + "roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tokenizer.json", + "distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tokenizer.json", + "roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tokenizer.json", + "roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "roberta-base": 512, + "roberta-large": 512, + "roberta-large-mnli": 512, + "distilroberta-base": 512, + "roberta-base-openai-detector": 512, + "roberta-large-openai-detector": 512, +} + + +class RobertaTokenizerFast(GPT2TokenizerFast): + """ + Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2 + tokenizer, using byte-level Byte-Pair-Encoding. 
+ + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + :: + + >>> from transformers import RobertaTokenizerFast + >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base") + >>> tokenizer("Hello world")['input_ids'] + [0, 31414, 232, 328, 2] + >>> tokenizer(" Hello world")['input_ids'] + [0, 20920, 232, 2] + + You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + .. note:: + + When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with + ``add_prefix_space=True``. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode + `__ for more information. + bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end + of sequence. The token used is the :obj:`sep_token`. + sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (RoBERTa tokenizer detect beginning of words by the preceding space). + trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether the post processing step should trim offsets to avoid including whitespaces. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] + slow_tokenizer_class = RobertaTokenizer + + def __init__( + self, + vocab_file, + merges_file, + tokenizer_file=None, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + **kwargs + ): + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + if token_ids_1 is None: + return output + + return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + RoBERTa does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] diff --git a/src/transformers/tokenization_squeezebert.py b/src/transformers/tokenization_squeezebert.py index dc341226143..285be79a321 100644 --- a/src/transformers/tokenization_squeezebert.py +++ b/src/transformers/tokenization_squeezebert.py @@ -14,7 +14,7 @@ # limitations under the License. """Tokenization classes for SqueezeBERT.""" -from .tokenization_bert import BertTokenizer, BertTokenizerFast +from .tokenization_bert import BertTokenizer from .utils import logging @@ -59,20 +59,3 @@ class SqueezeBertTokenizer(BertTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - - -class SqueezeBertTokenizerFast(BertTokenizerFast): - r""" - Constructs a "Fast" SqueezeBert tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.SqueezeBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and - runs end-to-end tokenization: punctuation splitting + wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION diff --git a/src/transformers/tokenization_squeezebert_fast.py b/src/transformers/tokenization_squeezebert_fast.py new file mode 100644 index 00000000000..677b7e40fe7 --- /dev/null +++ b/src/transformers/tokenization_squeezebert_fast.py @@ -0,0 +1,68 @@ +# coding=utf-8 +# Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for SqueezeBERT.""" + +from .tokenization_bert_fast import BertTokenizerFast +from .tokenization_squeezebert import SqueezeBertTokenizer +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "squeezebert/squeezebert-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/squeezebert/squeezebert-uncased/vocab.txt", + "squeezebert/squeezebert-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/squeezebert/squeezebert-mnli/vocab.txt", + "squeezebert/squeezebert-mnli-headless": "https://s3.amazonaws.com/models.huggingface.co/bert/squeezebert/squeezebert-mnli-headless/vocab.txt", + }, + "tokenizer_file": { + "squeezebert/squeezebert-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/squeezebert/squeezebert-uncased/tokenizer.json", + "squeezebert/squeezebert-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/squeezebert/squeezebert-mnli/tokenizer.json", + "squeezebert/squeezebert-mnli-headless": "https://s3.amazonaws.com/models.huggingface.co/bert/squeezebert/squeezebert-mnli-headless/tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "squeezebert/squeezebert-uncased": 512, + "squeezebert/squeezebert-mnli": 512, + "squeezebert/squeezebert-mnli-headless": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "squeezebert/squeezebert-uncased": {"do_lower_case": True}, + "squeezebert/squeezebert-mnli": {"do_lower_case": True}, + "squeezebert/squeezebert-mnli-headless": {"do_lower_case": True}, +} + + +class SqueezeBertTokenizerFast(BertTokenizerFast): + r""" + Constructs a "Fast" SqueezeBert tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.SqueezeBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and + runs end-to-end tokenization: punctuation splitting + wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = SqueezeBertTokenizer diff --git a/src/transformers/tokenization_t5.py b/src/transformers/tokenization_t5.py index 5d885ab114d..1502d328987 100644 --- a/src/transformers/tokenization_t5.py +++ b/src/transformers/tokenization_t5.py @@ -19,12 +19,13 @@ import os import re import warnings from shutil import copyfile -from typing import List, Optional +from typing import List, Optional, Tuple + +import sentencepiece as spm from .file_utils import add_start_docstrings from .tokenization_utils import BatchEncoding, PreTrainedTokenizer from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING -from .tokenization_utils_fast import PreTrainedTokenizerFast from .utils import logging @@ -124,16 +125,6 @@ class T5Tokenizer(PreTrainedTokenizer): **kwargs, ) - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use T5Tokenizer:" - "https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise - self.vocab_file = vocab_file self._extra_ids = extra_ids @@ -223,14 +214,6 @@ class T5Tokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use T5Tokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) @@ -263,21 +246,13 @@ class T5Tokenizer(PreTrainedTokenizer): out_string = self.sp_model.decode_pieces(tokens) return out_string - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) @@ -323,161 +298,3 @@ class T5Tokenizer(PreTrainedTokenizer): ) model_inputs["labels"] = labels_and_decoder_mask["input_ids"] return model_inputs - - -class T5TokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece - `__ . - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - `SentencePiece `__ file (generally has a `.spm` extension) that - contains the vocabulary necessary to instantiate a tokenizer. - eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The end of sequence token. - - .. 
note:: - - When building a sequence using special tokens, this is not the token that is used for the end - of sequence. The token used is the :obj:`sep_token`. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for padding, for example when batching sequences of different lengths. - extra_ids (:obj:`int`, `optional`, defaults to 100): - Add a number of extra ids added to the end of the vocabulary for use as sentinels. - These tokens are accessible as "" where "{%d}" is a number between 0 and extra_ids-1. - Extra tokens are indexed from the end of the vocabulary up to beginnning ("" is the last token - in the vocabulary like in T5 preprocessing see `here - `__). - additional_special_tokens (:obj:`List[str]`, `optional`): - Additional special tokens used by the tokenizer. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["attention_mask"] - slow_tokenizer_class = T5Tokenizer - - prefix_tokens: List[int] = [] - - def __init__( - self, - vocab_file, - eos_token="", - unk_token="", - pad_token="", - extra_ids=100, - additional_special_tokens=None, - **kwargs - ): - super().__init__( - vocab_file, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - extra_ids=extra_ids, - additional_special_tokens=additional_special_tokens, - **kwargs, - ) - - self.vocab_file = vocab_file - self._extra_ids = extra_ids - - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks - by concatenating and adding special tokens. - A sequence has the following format: - - - single sequence: ``X `` - - pair of sequences: ``A B `` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
- """ - token_ids_0 = token_ids_0 + [self.eos_token_id] - if token_ids_1 is None: - return self.prefix_tokens + token_ids_0 - else: - token_ids_1 = token_ids_1 + [self.eos_token_id] - return self.prefix_tokens + token_ids_0 + token_ids_1 - - @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) - def prepare_seq2seq_batch( - self, - src_texts: List[str], - tgt_texts: Optional[List[str]] = None, - max_length: Optional[int] = None, - max_target_length: Optional[int] = None, - padding: str = "longest", - return_tensors: str = None, - truncation: bool = True, - **kwargs, - ) -> BatchEncoding: - if max_length is None: - max_length = self.max_len - self.prefix_tokens = [] - model_inputs = self( - src_texts, - add_special_tokens=True, - return_tensors=return_tensors, - max_length=max_length, - padding=padding, - truncation=truncation, - **kwargs, - ) - if tgt_texts is None: - return model_inputs - # Process tgt_texts - if max_target_length is None: - max_target_length = max_length - # set prefix_tokens for target text - self.prefix_tokens = [self.pad_token_id] - labels_and_decoder_mask = self( - tgt_texts, - add_special_tokens=True, - return_tensors=return_tensors, - padding=padding, - max_length=max_target_length, - truncation=truncation, - **kwargs, - ) - model_inputs["labels"] = labels_and_decoder_mask["input_ids"] - self.prefix_tokens = [] - return model_inputs diff --git a/src/transformers/tokenization_t5_fast.py b/src/transformers/tokenization_t5_fast.py new file mode 100644 index 00000000000..b972b439ffd --- /dev/null +++ b/src/transformers/tokenization_t5_fast.py @@ -0,0 +1,232 @@ +# coding=utf-8 +# Copyright 2018 T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization class for model T5.""" + + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from .file_utils import add_start_docstrings, is_sentencepiece_available +from .tokenization_utils import BatchEncoding +from .tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .utils import logging + + +if is_sentencepiece_available(): + from .tokenization_t5 import T5Tokenizer +else: + T5Tokenizer = None + + +logger = logging.get_logger(__name__) + +#################################################### +# Mapping from the keyword arguments names of Tokenizer `__init__` +# to file names for serializing Tokenizer instances +#################################################### +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + +#################################################### +# Mapping from the keyword arguments names of Tokenizer `__init__` +# to pretrained vocabulary URL for all the model shortcut names. 
+####################################################
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+    },
+    "tokenizer_file": {
+        "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-tokenizer.json",
+        "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-tokenizer.json",
+        "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-tokenizer.json",
+        "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-tokenizer.json",
+        "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-tokenizer.json",
+    },
+}
+
+####################################################
+# Mapping from model shortcut names to max length of inputs
+####################################################
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "t5-small": 512,
+    "t5-base": 512,
+    "t5-large": 512,
+    "t5-3b": 512,
+    "t5-11b": 512,
+}
+
+
+class T5TokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
+    <https://github.com/google/sentencepiece>`__.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+            this token instead.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        extra_ids (:obj:`int`, `optional`, defaults to 100):
+            Add a number of extra ids added to the end of the vocabulary for use as sentinels.
+            These tokens are accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1.
+            Extra tokens are indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token
+            in the vocabulary like in T5 preprocessing, see `here
+            `__).
+        additional_special_tokens (:obj:`List[str]`, `optional`):
+            Additional special tokens used by the tokenizer.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
+    slow_tokenizer_class = T5Tokenizer
+
+    prefix_tokens: List[int] = []
+
+    def __init__(
+        self,
+        vocab_file,
+        tokenizer_file=None,
+        eos_token="</s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        extra_ids=100,
+        additional_special_tokens=None,
+        **kwargs
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            extra_ids=extra_ids,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+
+        if extra_ids > 0:
+            all_extra_tokens = ["<extra_id_{}>".format(i) for i in range(extra_ids)]
+            if all(tok not in self.additional_special_tokens for tok in all_extra_tokens):
+                self.additional_special_tokens = self.additional_special_tokens + [
+                    "<extra_id_{}>".format(i) for i in range(extra_ids)
+                ]
+
+        self.vocab_file = vocab_file
+        self._extra_ids = extra_ids
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A sequence has the following format:
+
+        - single sequence: ``X </s>``
+        - pair of sequences: ``A </s> B </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+ """ + token_ids_0 = token_ids_0 + [self.eos_token_id] + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + else: + token_ids_1 = token_ids_1 + [self.eos_token_id] + return self.prefix_tokens + token_ids_0 + token_ids_1 + + @add_start_docstrings(PREPARE_SEQ2SEQ_BATCH_DOCSTRING) + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + padding: str = "longest", + return_tensors: str = None, + truncation: bool = True, + **kwargs, + ) -> BatchEncoding: + if max_length is None: + max_length = self.max_len + self.prefix_tokens = [] + model_inputs = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=truncation, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + # set prefix_tokens for target text + self.prefix_tokens = [self.pad_token_id] + labels_and_decoder_mask = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=truncation, + **kwargs, + ) + model_inputs["labels"] = labels_and_decoder_mask["input_ids"] + self.prefix_tokens = [] + return model_inputs diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py index 6e018150d68..e1b4c99ac38 100644 --- a/src/transformers/tokenization_transfo_xl.py +++ b/src/transformers/tokenization_transfo_xl.py @@ -23,7 +23,7 @@ import os import pickle import re from collections import Counter, OrderedDict -from typing import List +from typing import List, Optional, Tuple import numpy as np @@ -276,22 +276,14 @@ class TransfoXLTokenizer(PreTrainedTokenizer): else: raise ValueError("No token in vocabulary") - def save_vocabulary(self, vocab_path): - """ - Save the vocabulary and special tokens file to a directory. - - Args: - vocab_path (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ - - if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"]) + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["pretrained_vocab_file"], + ) else: - vocab_file = vocab_path + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory with open(vocab_file, "wb") as f: pickle.dump(self.__dict__, f) return (vocab_file,) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 937e0ebf880..6a2bc7a0db3 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -805,23 +805,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): else: return text - def save_vocabulary(self, save_directory) -> Tuple[str]: - """ - Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens - and special token mappings. - - .. warning:: - Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if - you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method. 
- - Args: - save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved. - - Returns: - A tuple of :obj:`str`: The files saved. - """ - raise NotImplementedError - def prepare_seq2seq_batch( self, src_texts: List[str], diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 9bd50a9488a..b2c1797b5e1 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -23,20 +23,19 @@ import json import os import warnings from collections import OrderedDict, UserDict +from dataclasses import dataclass, field from enum import Enum from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union import numpy as np -from tokenizers import AddedToken -from tokenizers import Encoding as EncodingFast - from .file_utils import ( add_end_docstrings, cached_path, hf_bucket_url, is_remote_url, is_tf_available, + is_tokenizers_available, is_torch_available, torch_required, ) @@ -45,9 +44,36 @@ from .utils import logging if is_tf_available(): import tensorflow as tf + if is_torch_available(): import torch +if is_tokenizers_available(): + from tokenizers import AddedToken + from tokenizers import Encoding as EncodingFast +else: + + @dataclass(frozen=True, eq=True) + class AddedToken: + """AddedToken represents a token to be added to a Tokenizer + An AddedToken can have special options defining the way it should behave. + """ + + content: str = field(default_factory=str) + single_word: bool = False + lstrip: bool = False + rstrip: bool = False + normalized: bool = True + + def __getstate__(self): + return self.__dict__ + + @dataclass + class EncodingFast: + """ This is dummy class because without the `tokenizers` library we don't have these objects anyway """ + + pass + logger = logging.get_logger(__name__) @@ -1304,6 +1330,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) self.init_inputs = () self.init_kwargs = copy.deepcopy(kwargs) + self.name_or_path = kwargs.pop("name_or_path", "") # For backward compatibility we fallback to set model_max_length from max_len if provided model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None)) @@ -1377,6 +1404,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." 
) + def __repr__(self) -> str: + return ( + f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', " + f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, " + f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})" + ) + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): r""" @@ -1562,7 +1596,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): # We instantiate fast tokenizers based on a slow tokenizer for now # In the future we can also use a direct way based on saving/instantiating # tokenizer's Tokenizer directly from it's serialization JSON - if cls.slow_tokenizer_class is not None: + if ( + "tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None + ) and cls.slow_tokenizer_class is not None: slow_tokenizer = cls.slow_tokenizer_class._from_pretrained( copy.deepcopy(resolved_vocab_files), pretrained_model_name_or_path, @@ -1618,6 +1654,8 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): if slow_tokenizer is not None: init_kwargs["__slow_tokenizer"] = slow_tokenizer + init_kwargs["name_or_path"] = pretrained_model_name_or_path + # Instantiate tokenizer. try: tokenizer = cls(*init_inputs, **init_kwargs) @@ -1669,7 +1707,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): return tokenizer - def save_pretrained(self, save_directory: str) -> Tuple[str]: + def save_pretrained( + self, save_directory: str, legacy_format: bool = True, filename_prefix: Optional[str] = None + ) -> Tuple[str]: """ Save the full tokenizer state. @@ -1688,7 +1728,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): modifying :obj:`tokenizer.do_lower_case` after creation). Args: - save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved. + save_directory (:obj:`str`): The path to adirectory where the tokenizer will be saved. + legacy_format (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to save the tokenizer in legacy format (default), i.e. with tokenizer specific vocabulary and + a separate added_tokens files or in the unified JSON file format for the `tokenizers` library. + It's only possible to save a Fast tokenizer in the unified JSON format and this format is incompatible + with "slow" tokenizers (not powered by the `tokenizers` library). + filename_prefix: (:obj:`str`, `optional`): + A prefix to add to the names of the files saved by the tokenizer. Returns: A tuple of :obj:`str`: The files saved. 
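The new ``legacy_format`` and ``filename_prefix`` arguments introduced above change what ends up on disk. A minimal usage sketch, not part of the diff, assuming this branch is installed and using ``BertTokenizerFast`` purely as an example of a fast tokenizer:

.. code-block:: python

    from transformers import BertTokenizerFast

    tok = BertTokenizerFast.from_pretrained("bert-base-uncased")

    # Legacy layout (default): tokenizer config, special tokens map, vocabulary files and an
    # added-tokens file, each name prefixed with "demo-".
    tok.save_pretrained("saved_legacy", legacy_format=True, filename_prefix="demo")

    # Unified layout: the backend `tokenizers` serialization is written to a single
    # "demo-tokenizer.json" (the config and special tokens map are still saved alongside it).
    tok.save_pretrained("saved_fast", legacy_format=False, filename_prefix="demo")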
@@ -1698,8 +1745,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): return os.makedirs(save_directory, exist_ok=True) - special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) - tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) + special_tokens_map_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE + ) + tokenizer_config_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE + ) tokenizer_config = copy.deepcopy(self.init_kwargs) if len(self.init_inputs) > 0: @@ -1732,19 +1783,61 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): file_names = (tokenizer_config_file, special_tokens_map_file) - return self._save_pretrained(save_directory, file_names) + return self._save_pretrained( + save_directory=save_directory, + file_names=file_names, + legacy_format=legacy_format, + filename_prefix=filename_prefix, + ) - def _save_pretrained(self, save_directory: str, file_names: Tuple[str]) -> Tuple[str]: - added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) + def _save_pretrained( + self, + save_directory: str, + file_names: Tuple[str], + legacy_format: bool = True, + filename_prefix: Optional[str] = None, + ) -> Tuple[str]: + """Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens. + + Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} + using the specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained` + """ + if not legacy_format: + raise ValueError( + "Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format." + ) + + added_tokens_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE + ) added_vocab = self.get_added_vocab() if added_vocab: with open(added_tokens_file, "w", encoding="utf-8") as f: out_str = json.dumps(added_vocab, ensure_ascii=False) f.write(out_str) - vocab_files = self.save_vocabulary(save_directory) + vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix) - return file_names + (vocab_files, added_tokens_file) + return file_names + vocab_files + (added_tokens_file,) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save only the vocabulary of the tokenizer (vocabulary + added tokens). + + This method won't save the configuration and special token mappings of the tokenizer. + Use :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save + the whole state of the tokenizer. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + filename_prefix (:obj:`str`, `optional`): + An optional prefix to add to the named of the saved files. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. 
+ """ + raise NotImplementedError @add_end_docstrings( ENCODE_KWARGS_DOCSTRING, diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 4ee41c4f978..6ee3c8a725e 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -17,6 +17,7 @@ """ import copy +import json import os import warnings from collections import defaultdict @@ -79,16 +80,32 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): slow_tokenizer_class: PreTrainedTokenizer = None def __init__(self, *args, **kwargs): - # We instantiate fast tokenizers based on a slow tokenizer for now - # In the future we'll also use a direct way based on saving/instantiating - # tokenizer's Tokenizer directly from it's serialization JSON - if "__slow_tokenizer" in kwargs and kwargs["__slow_tokenizer"]: - slow_tokenizer = kwargs.pop("__slow_tokenizer") - else: - slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs) - self._tokenizer = convert_slow_tokenizer(slow_tokenizer) + slow_tokenizer = kwargs.pop("__slow_tokenizer", None) + fast_tokenizer_file = kwargs.pop("tokenizer_file", None) - kwargs = copy.deepcopy(slow_tokenizer.init_kwargs) + if fast_tokenizer_file is not None: + # We have a serialization from tokenizers which let us directly build the backend + fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file) + elif slow_tokenizer is not None: + # We need to convert a slow tokenizer to build the backend + fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) + elif self.slow_tokenizer_class is not None: + # We need to create and convert a slow tokenizer to build the backend + slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs) + fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) + else: + raise ValueError( + "Couldn't instantiate the backend tokenizer from one of: " + "(1) a `tokenizers` library serialization file, " + "(2) a slow tokenizer instance to convert or " + "(3) an equivalent slow tokenizer class to instantiate and convert. " + "You need to have sentencepiece installed to convert a slow tokenizer to a fast one." + ) + + self._tokenizer = fast_tokenizer + + if slow_tokenizer is not None: + kwargs = copy.deepcopy(slow_tokenizer.init_kwargs) # We call this after having initialized the backend tokenizer because we update it. super().__init__(**kwargs) @@ -542,25 +559,35 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): else: return text - def save_vocabulary(self, save_directory: str) -> Tuple[str]: + def _save_pretrained( + self, + save_directory: str, + file_names: Tuple[str], + legacy_format: bool = True, + filename_prefix: Optional[str] = None, + ) -> Tuple[str]: + """Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens. + + Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} + using the specific :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` """ - Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens - and special token mappings. + if legacy_format: + added_tokens_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE + ) + added_vocab = self.get_added_vocab() + if added_vocab: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(added_vocab, ensure_ascii=False) + f.write(out_str) - .. 
warning:: - Please use :meth:`~transformers.PreTrainedTokenizerFast.save_pretrained` to save the full tokenizer state if - you want to reload it using the :meth:`~transformers.PreTrainedTokenizerFast.from_pretrained` class method. - - Args: - save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved. - - Returns: - A tuple of :obj:`str`: The files saved. - """ - if os.path.isdir(save_directory): - files = self._tokenizer.model.save(save_directory) + vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix) + file_names = file_names + vocab_files + (added_tokens_file,) else: - folder, file = os.path.split(os.path.abspath(save_directory)) - files = self._tokenizer.save_model(folder, name=file) + tokenizer_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE + ) + self.backend_tokenizer.save(tokenizer_file) + file_names = file_names + (tokenizer_file,) - return tuple(files) + return file_names diff --git a/src/transformers/tokenization_xlm.py b/src/transformers/tokenization_xlm.py index 9a1286e4422..76a36f38e36 100644 --- a/src/transformers/tokenization_xlm.py +++ b/src/transformers/tokenization_xlm.py @@ -20,7 +20,7 @@ import os import re import sys import unicodedata -from typing import List, Optional +from typing import List, Optional, Tuple import sacremoses as sm @@ -942,22 +942,16 @@ class XLMTokenizer(PreTrainedTokenizer): return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - def save_vocabulary(self, save_directory): - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. 
- """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"]) + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) with open(vocab_file, "w", encoding="utf-8") as f: f.write(json.dumps(self.encoder, ensure_ascii=False)) diff --git a/src/transformers/tokenization_xlm_roberta.py b/src/transformers/tokenization_xlm_roberta.py index 24139b88116..44052e81857 100644 --- a/src/transformers/tokenization_xlm_roberta.py +++ b/src/transformers/tokenization_xlm_roberta.py @@ -17,10 +17,11 @@ import os from shutil import copyfile -from typing import List, Optional +from typing import List, Optional, Tuple + +import sentencepiece as spm from .tokenization_utils import PreTrainedTokenizer -from .tokenization_utils_fast import PreTrainedTokenizerFast from .tokenization_xlnet import SPIECE_UNDERLINE from .utils import logging @@ -127,15 +128,6 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): **kwargs, ) - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise - self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -162,14 +154,6 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) @@ -288,209 +272,14 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return out_string - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - -class XLMRobertaTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from - :class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on `SentencePiece - `__. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. 
Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - Path to the vocabulary file. - bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the beginning - of sequence. The token used is the :obj:`cls_token`. - eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The end of sequence token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the end - of sequence. The token used is the :obj:`sep_token`. - sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences - for sequence classification or for a text and a question for question answering. - It is also used as the last token of a sequence built with special tokens. - cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The classifier token which is used when doing sequence classification (classification of the whole - sequence instead of per-token classification). It is the first token of the sequence when built with - special tokens. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. - - Attributes: - sp_model (:obj:`SentencePieceProcessor`): - The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["attention_mask"] - slow_tokenizer_class = XLMRobertaTokenizer - - def __init__( - self, - vocab_file, - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - **kwargs - ): - super().__init__( - vocab_file, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - cls_token=cls_token, - unk_token=unk_token, - pad_token=pad_token, - mask_token=mask_token, - **kwargs, + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] ) - self.vocab_file = vocab_file - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks - by concatenating and adding special tokens. - An XLM-RoBERTa sequence has the following format: - - - single sequence: `` X `` - - pair of sequences: `` A B `` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. 
- token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. - XLM-RoBERTa does not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of zeros. - - """ - - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/src/transformers/tokenization_xlm_roberta_fast.py b/src/transformers/tokenization_xlm_roberta_fast.py new file mode 100644 index 00000000000..a477c641405 --- /dev/null +++ b/src/transformers/tokenization_xlm_roberta_fast.py @@ -0,0 +1,244 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +""" Tokenization classes for XLM-RoBERTa model.""" + + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from .file_utils import is_sentencepiece_available +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .utils import logging + + +if is_sentencepiece_available(): + from .tokenization_xlm_roberta import XLMRobertaTokenizer +else: + XLMRobertaTokenizer = None + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model", + "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model", + "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model", + }, + "tokenizer_file": { + "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-tokenizer.json", + "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-tokenizer.json", + "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-tokenizer.json", + "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-tokenizer.json", + "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-tokenizer.json", + "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "xlm-roberta-base": 512, + "xlm-roberta-large": 512, + "xlm-roberta-large-finetuned-conll02-dutch": 512, + "xlm-roberta-large-finetuned-conll02-spanish": 512, + "xlm-roberta-large-finetuned-conll03-english": 512, + "xlm-roberta-large-finetuned-conll03-german": 512, +} + + +class XLMRobertaTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). 
Adapted from
+    :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece
+    <https://github.com/google/sentencepiece>`__.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
+            token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+            this token instead.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
+    slow_tokenizer_class = XLMRobertaTokenizer
+
+    def __init__(
+        self,
+        vocab_file,
+        tokenizer_file=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        **kwargs
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            **kwargs,
+        )
+
+        self.vocab_file = vocab_file
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+ An XLM-RoBERTa sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + XLM-RoBERTa does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. 
+ + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/src/transformers/tokenization_xlnet.py b/src/transformers/tokenization_xlnet.py index f2484b2af0e..d41f7a5bc97 100644 --- a/src/transformers/tokenization_xlnet.py +++ b/src/transformers/tokenization_xlnet.py @@ -18,10 +18,12 @@ import os import unicodedata from shutil import copyfile -from typing import List, Optional +from typing import List, Optional, Tuple +import sentencepiece as spm + +from .file_utils import SPIECE_UNDERLINE from .tokenization_utils import PreTrainedTokenizer -from .tokenization_utils_fast import PreTrainedTokenizerFast from .utils import logging @@ -41,8 +43,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "xlnet-large-cased": None, } -SPIECE_UNDERLINE = "▁" - # Segments (not really needed) SEG_ID_A = 0 SEG_ID_B = 1 @@ -141,15 +141,6 @@ class XLNetTokenizer(PreTrainedTokenizer): self._pad_token_type_id = 3 - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise - self.do_lower_case = do_lower_case self.remove_space = remove_space self.keep_accents = keep_accents @@ -174,14 +165,6 @@ class XLNetTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - try: - import sentencepiece as spm - except ImportError: - logger.warning( - "You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" - "pip install sentencepiece" - ) - raise self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) @@ -325,232 +308,14 @@ class XLNetTokenizer(PreTrainedTokenizer): return len(token_ids_0 + sep) * [0] + cls_segment_id return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: if not os.path.isdir(save_directory): logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - -class XLNetTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on - `SentencePiece `__. 
- - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main - methods. Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - `SentencePiece `__ file (generally has a .spm extension) that - contains the vocabulary necessary to instantiate a tokenizer. - do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether to lowercase the input when tokenizing. - remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether to strip the text when tokenizing (removing excess spaces before and after the string). - keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether to keep accents when tokenizing. - bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the beginning - of sequence. The token used is the :obj:`cls_token`. - eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The end of sequence token. - - .. note:: - - When building a sequence using special tokens, this is not the token that is used for the end - of sequence. The token used is the :obj:`sep_token`. - unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences - for sequence classification or for a text and a question for question answering. - It is also used as the last token of a sequence built with special tokens. - pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The classifier token which is used when doing sequence classification (classification of the whole - sequence instead of per-token classification). It is the first token of the sequence when built with - special tokens. - mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): - Additional special tokens used by the tokenizer. - - Attributes: - sp_model (:obj:`SentencePieceProcessor`): - The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - padding_side = "left" - slow_tokenizer_class = XLNetTokenizer - - def __init__( - self, - vocab_file, - do_lower_case=False, - remove_space=True, - keep_accents=False, - bos_token="", - eos_token="", - unk_token="", - sep_token="", - pad_token="", - cls_token="", - mask_token="", - additional_special_tokens=["", ""], - **kwargs - ): - super().__init__( - vocab_file=vocab_file, - do_lower_case=do_lower_case, - remove_space=remove_space, - keep_accents=keep_accents, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - additional_special_tokens=additional_special_tokens, - **kwargs, + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] ) - self._pad_token_type_id = 3 - self.do_lower_case = do_lower_case - self.remove_space = remove_space - self.keep_accents = keep_accents - self.vocab_file = vocab_file - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks - by concatenating and adding special tokens. - An XLNet sequence has the following format: - - - single sequence: ``X `` - - pair of sequences: ``A B `` - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return token_ids_0 + sep + cls - return token_ids_0 + sep + token_ids_1 + sep + cls - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` method. - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] - return ([0] * len(token_ids_0)) + [1, 1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. - An XLNet sequence pair mask has the following format: - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). - - Args: - token_ids_0 (:obj:`List[int]`): - List of IDs. - token_ids_1 (:obj:`List[int]`, `optional`): - Optional second list of IDs for sequence pairs. - - Returns: - :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given - sequence(s). - """ - sep = [self.sep_token_id] - cls_segment_id = [2] - - if token_ids_1 is None: - return len(token_ids_0 + sep) * [0] + cls_segment_id - return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id - - def save_vocabulary(self, save_directory): - """ - Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. - - Args: - save_directory (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. - """ - if not os.path.isdir(save_directory): - logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) - return - out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"]) - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): copyfile(self.vocab_file, out_vocab_file) diff --git a/src/transformers/tokenization_xlnet_fast.py b/src/transformers/tokenization_xlnet_fast.py new file mode 100644 index 00000000000..ca762e34d97 --- /dev/null +++ b/src/transformers/tokenization_xlnet_fast.py @@ -0,0 +1,264 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Tokenization classes for XLNet model.""" + + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from .file_utils import is_sentencepiece_available +from .tokenization_utils_fast import PreTrainedTokenizerFast +from .utils import logging + + +if is_sentencepiece_available(): + from .tokenization_xlnet import XLNetTokenizer +else: + XLNetTokenizer = None + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model", + "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model", + }, + "tokenizer_file": { + "xlnet-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tokenizer.json", + "xlnet-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tokenizer.json", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "xlnet-base-cased": None, + "xlnet-large-cased": None, +} + +SPIECE_UNDERLINE = "▁" + +# Segments (not really needed) +SEG_ID_A = 0 +SEG_ID_B = 1 +SEG_ID_CLS = 2 +SEG_ID_SEP = 3 +SEG_ID_PAD = 4 + + +class XLNetTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on + `SentencePiece `__. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main + methods. Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a .spm extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase the input when tokenizing. + remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to keep accents when tokenizing. + bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning + of sequence. The token used is the :obj:`cls_token`. + eos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The end of sequence token. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the end + of sequence. The token used is the :obj:`sep_token`. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. 
+ cls_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): + Additional special tokens used by the tokenizer. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + padding_side = "left" + slow_tokenizer_class = XLNetTokenizer + + def __init__( + self, + vocab_file, + tokenizer_file=None, + do_lower_case=False, + remove_space=True, + keep_accents=False, + bos_token="", + eos_token="", + unk_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + additional_special_tokens=["", ""], + **kwargs + ): + super().__init__( + vocab_file=vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + self._pad_token_type_id = 3 + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + An XLNet sequence has the following format: + + - single sequence: ``X `` + - pair of sequences: ``A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return token_ids_0 + sep + cls + return token_ids_0 + sep + token_ids_1 + sep + cls + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. 
+
+        Returns:
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is not None:
+            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
+        return ([0] * len(token_ids_0)) + [1, 1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
+        An XLNet sequence pair mask has the following format:
+
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence      | second sequence |
+
+        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls_segment_id = [2]
+
+        if token_ids_1 is None:
+            return len(token_ids_0 + sep) * [0] + cls_segment_id
+        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 5d471f73b72..930cbb4da2e 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1425,7 +1425,7 @@ class Trainer:
 
     def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any]]):
         """
-        For models that inherit from :class:`~transformers.PretrainedModel`, uses
+        For models that inherit from :class:`~transformers.PreTrainedModel`, uses
         that method to compute the number of floating point operations for every backward + forward pass. If using
         another model, either implement such a method in the model or subclass and override this method.
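As an aside for readers skimming the diff: the two helper methods in the new XLNetTokenizerFast above implement XLNet's trailing-special-token layout. The standalone sketch below (not part of the patch) mirrors that logic; the SEP_ID and CLS_ID values are illustrative placeholders, not the real XLNet vocabulary IDs.

# Minimal sketch of the special-token layout built by the methods above.
# SEP_ID and CLS_ID are made-up placeholders, not actual XLNet vocabulary IDs.
SEP_ID, CLS_ID = 4, 3

def xlnet_inputs_with_special_tokens(ids_0, ids_1=None):
    # single sequence: X <sep> <cls>; pair of sequences: A <sep> B <sep> <cls>
    if ids_1 is None:
        return ids_0 + [SEP_ID, CLS_ID]
    return ids_0 + [SEP_ID] + ids_1 + [SEP_ID, CLS_ID]

def xlnet_token_type_ids(ids_0, ids_1=None):
    # segment 0 for the first sequence and its <sep>, 1 for the second, 2 for the final <cls>
    if ids_1 is None:
        return [0] * (len(ids_0) + 1) + [2]
    return [0] * (len(ids_0) + 1) + [1] * (len(ids_1) + 1) + [2]

print(xlnet_inputs_with_special_tokens([10, 11], [20, 21]))  # [10, 11, 4, 20, 21, 4, 3]
print(xlnet_token_type_ids([10, 11], [20, 21]))              # [0, 0, 0, 1, 1, 1, 2]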
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index e2c1154262b..0b8e7662f3a 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1881,15 +1881,6 @@ def get_polynomial_decay_schedule_with_warmup(*args, **kwargs): requires_pytorch(get_polynomial_decay_schedule_with_warmup) -class MarianTokenizer: - def __init__(self, *args, **kwargs): - requires_pytorch(self) - - @classmethod - def from_pretrained(self, *args, **kwargs): - requires_pytorch(self) - - class Trainer: def __init__(self, *args, **kwargs): requires_pytorch(self) diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py new file mode 100644 index 00000000000..46fdb3a46d9 --- /dev/null +++ b/src/transformers/utils/dummy_sentencepiece_objects.py @@ -0,0 +1,92 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..file_utils import requires_sentencepiece + + +class AlbertTokenizer: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) + + +class BertGenerationTokenizer: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) + + +class CamembertTokenizer: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) + + +class MarianTokenizer: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) + + +class MBartTokenizer: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) + + +class PegasusTokenizer: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) + + +class ReformerTokenizer: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) + + +class T5Tokenizer: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) + + +class XLMRobertaTokenizer: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) + + +class XLNetTokenizer: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py new file mode 100644 index 00000000000..a9ae88b371e --- /dev/null +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -0,0 +1,252 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
+from ..file_utils import requires_tokenizers + + +class AlbertTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class BartTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class BertTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class CamembertTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class DistilBertTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class DPRContextEncoderTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class DPRQuestionEncoderTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class DPRReaderTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class ElectraTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class FunnelTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class GPT2TokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class HerbertTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class LayoutLMTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class LongformerTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class LxmertTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class MBartTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class MobileBertTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class OpenAIGPTTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class PegasusTokenizerFast: + def 
__init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class ReformerTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class RetriBertTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class RobertaTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class SqueezeBertTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class T5TokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class PreTrainedTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class XLMRobertaTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +class XLNetTokenizerFast: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) + + +SLOW_TO_FAST_CONVERTERS = None + + +def convert_slow_tokenizer(*args, **kwargs): + requires_tokenizers(convert_slow_tokenizer) diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py index 60fbc2c341c..94b9eba7d5c 100644 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ b/templates/adding_a_new_model/tokenization_xxx.py @@ -18,7 +18,7 @@ import collections import logging import os -from typing import List, Optional +from typing import List, Optional, Tuple from .tokenization_utils import PreTrainedTokenizer @@ -275,22 +275,14 @@ class XxxTokenizer(PreTrainedTokenizer): return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - def save_vocabulary(self, vocab_path): - """ - Save the vocabulary (copy original file) and special tokens file to a directory. - - Args: - vocab_path (:obj:`str`): - The directory in which to save the vocabulary. - - Returns: - :obj:`Tuple(str)`: Paths to the files saved. 
- """ + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: index = 0 - if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) else: - vocab_file = vocab_path + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: diff --git a/tests/fixtures/sample_text_no_unicode.txt b/tests/fixtures/sample_text_no_unicode.txt new file mode 100644 index 00000000000..74646661c7c --- /dev/null +++ b/tests/fixtures/sample_text_no_unicode.txt @@ -0,0 +1,32 @@ +Text should be one-sentence-per-line, with empty lines between documents. +This sample text is public domain and was randomly selected from Project Guttenberg. + +The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. +Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. +Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. +"Cass" Beard had risen early that morning, but not with a view to discovery. +A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. +The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. +This was nearly opposite. +Mr. Cassius crossed the highway, and stopped suddenly. +Something glittered in the nearest red pool before him. +Gold, surely! +But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. +Looking at it more attentively, he saw that it bore the inscription, "May to Cass." +Like most of his fellow gold-seekers, Cass was superstitious. + +The fountain of classic wisdom, Hypatia herself. +As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. +From my youth I felt in me a soul above the matter-entangled herd. +She revealed to me the glorious fact, that I am a spark of Divinity itself. +A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. +There is a philosophic pleasure in opening one's treasures to the modest young. +Perhaps you will assist me by carrying this basket of fruit?' 
And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. +Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; +but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. +Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. +His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; +while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. +At last they reached the quay at the opposite end of the street; +and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. +He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index 009dabf096e..aed3495a6de 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -20,7 +20,7 @@ import timeout_decorator # noqa from transformers import is_torch_available from transformers.file_utils import cached_property -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor @@ -207,6 +207,8 @@ class BARTModelTest(ModelTesterMixin, unittest.TestCase): def test_inputs_embeds(self): pass + @require_sentencepiece + @require_tokenizers def test_tiny_model(self): model_name = "sshleifer/bart-tiny-random" tiny = AutoModel.from_pretrained(model_name) # same vocab size @@ -439,6 +441,8 @@ TOLERANCE = 1e-4 @require_torch +@require_sentencepiece +@require_tokenizers class BartModelIntegrationTests(unittest.TestCase): @cached_property def default_tokenizer(self): diff --git a/tests/test_modeling_blenderbot.py b/tests/test_modeling_blenderbot.py index 768e16f5151..e3581783468 100644 --- a/tests/test_modeling_blenderbot.py +++ b/tests/test_modeling_blenderbot.py @@ -19,7 +19,7 @@ import unittest from transformers import is_torch_available from transformers.file_utils import cached_property -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor @@ -131,6 +131,8 @@ class BlenderbotTesterMixin(ModelTesterMixin, unittest.TestCase): @unittest.skipUnless(torch_device != 
"cpu", "3B test too slow on CPU.") @require_torch +@require_sentencepiece +@require_tokenizers class Blenderbot3BIntegrationTests(unittest.TestCase): ckpt = "facebook/blenderbot-3B" diff --git a/tests/test_modeling_camembert.py b/tests/test_modeling_camembert.py index f278d722169..41b0626e5bd 100644 --- a/tests/test_modeling_camembert.py +++ b/tests/test_modeling_camembert.py @@ -16,7 +16,7 @@ import unittest from transformers import is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device if is_torch_available(): @@ -26,6 +26,8 @@ if is_torch_available(): @require_torch +@require_sentencepiece +@require_tokenizers class CamembertModelIntegrationTest(unittest.TestCase): @slow def test_output_embeds_base_model(self): diff --git a/tests/test_modeling_deberta.py b/tests/test_modeling_deberta.py index e6ec2417270..28ded3cf974 100644 --- a/tests/test_modeling_deberta.py +++ b/tests/test_modeling_deberta.py @@ -20,7 +20,7 @@ import unittest import numpy as np from transformers import is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor @@ -236,6 +236,8 @@ class DebertaModelTest(ModelTesterMixin, unittest.TestCase): @require_torch +@require_sentencepiece +@require_tokenizers class DebertaModelIntegrationTest(unittest.TestCase): @unittest.skip(reason="Model not available yet") def test_inference_masked_lm(self): diff --git a/tests/test_modeling_fsmt.py b/tests/test_modeling_fsmt.py index b7c7a9f954f..d4b5653f744 100644 --- a/tests/test_modeling_fsmt.py +++ b/tests/test_modeling_fsmt.py @@ -22,7 +22,7 @@ import timeout_decorator # noqa from parameterized import parameterized from transformers import is_torch_available from transformers.file_utils import WEIGHTS_NAME, cached_property -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor @@ -393,6 +393,8 @@ pairs = [ @require_torch +@require_sentencepiece +@require_tokenizers class FSMTModelIntegrationTests(unittest.TestCase): tokenizers_cache = {} models_cache = {} diff --git a/tests/test_modeling_funnel.py b/tests/test_modeling_funnel.py index 6d126478575..1b59cc93fb2 100644 --- a/tests/test_modeling_funnel.py +++ b/tests/test_modeling_funnel.py @@ -17,7 +17,7 @@ import unittest from transformers import FunnelTokenizer, is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor @@ -417,6 +417,8 @@ class FunnelBaseModelTest(ModelTesterMixin, unittest.TestCase): @require_torch +@require_sentencepiece +@require_tokenizers class FunnelModelIntegrationTest(unittest.TestCase): def test_inference_tiny_model(self): batch_size = 13 diff --git a/tests/test_modeling_longformer.py b/tests/test_modeling_longformer.py index 
85430b0fd8f..7acc84b1c82 100644 --- a/tests/test_modeling_longformer.py +++ b/tests/test_modeling_longformer.py @@ -17,7 +17,7 @@ import unittest from transformers import is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask @@ -329,6 +329,8 @@ class LongformerModelTest(ModelTesterMixin, unittest.TestCase): @require_torch +@require_sentencepiece +@require_tokenizers class LongformerModelIntegrationTest(unittest.TestCase): def _get_hidden_states(self): return torch.tensor( diff --git a/tests/test_modeling_marian.py b/tests/test_modeling_marian.py index a2641114573..d139c1d4871 100644 --- a/tests/test_modeling_marian.py +++ b/tests/test_modeling_marian.py @@ -19,7 +19,7 @@ import unittest from transformers import is_torch_available from transformers.file_utils import cached_property from transformers.hf_api import HfApi -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device if is_torch_available(): @@ -53,6 +53,8 @@ class ModelManagementTests(unittest.TestCase): @require_torch +@require_sentencepiece +@require_tokenizers class MarianIntegrationTest(unittest.TestCase): src = "en" tgt = "de" @@ -110,6 +112,8 @@ class MarianIntegrationTest(unittest.TestCase): return generated_words +@require_sentencepiece +@require_tokenizers class TestMarian_EN_DE_More(MarianIntegrationTest): @slow def test_forward(self): @@ -154,6 +158,8 @@ class TestMarian_EN_DE_More(MarianIntegrationTest): self.assertIsInstance(config, MarianConfig) +@require_sentencepiece +@require_tokenizers class TestMarian_EN_FR(MarianIntegrationTest): src = "en" tgt = "fr" @@ -171,6 +177,8 @@ class TestMarian_EN_FR(MarianIntegrationTest): self._assert_generated_batch_equal_expected() +@require_sentencepiece +@require_tokenizers class TestMarian_FR_EN(MarianIntegrationTest): src = "fr" tgt = "en" @@ -188,6 +196,8 @@ class TestMarian_FR_EN(MarianIntegrationTest): self._assert_generated_batch_equal_expected() +@require_sentencepiece +@require_tokenizers class TestMarian_RU_FR(MarianIntegrationTest): src = "ru" tgt = "fr" @@ -199,6 +209,8 @@ class TestMarian_RU_FR(MarianIntegrationTest): self._assert_generated_batch_equal_expected() +@require_sentencepiece +@require_tokenizers class TestMarian_MT_EN(MarianIntegrationTest): src = "mt" tgt = "en" @@ -210,6 +222,8 @@ class TestMarian_MT_EN(MarianIntegrationTest): self._assert_generated_batch_equal_expected() +@require_sentencepiece +@require_tokenizers class TestMarian_en_zh(MarianIntegrationTest): src = "en" tgt = "zh" @@ -221,6 +235,8 @@ class TestMarian_en_zh(MarianIntegrationTest): self._assert_generated_batch_equal_expected() +@require_sentencepiece +@require_tokenizers class TestMarian_en_ROMANCE(MarianIntegrationTest): """Multilingual on target side.""" diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py index 0730da374b1..36034b4eae0 100644 --- a/tests/test_modeling_mbart.py +++ b/tests/test_modeling_mbart.py @@ -2,7 +2,7 @@ import unittest from transformers import is_torch_available from transformers.file_utils import cached_property -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils 
import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_modeling_bart import TOLERANCE, _long_tensor, assert_tensors_close @@ -24,6 +24,8 @@ RO_CODE = 250020 @require_torch +@require_sentencepiece +@require_tokenizers class AbstractSeq2SeqIntegrationTest(unittest.TestCase): maxDiff = 1000 # longer string compare tracebacks checkpoint_name = None @@ -43,6 +45,8 @@ class AbstractSeq2SeqIntegrationTest(unittest.TestCase): @require_torch +@require_sentencepiece +@require_tokenizers class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): checkpoint_name = "facebook/mbart-large-en-ro" src_text = [ @@ -134,6 +138,8 @@ class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest): @require_torch +@require_sentencepiece +@require_tokenizers class MBartCC25IntegrationTest(AbstractSeq2SeqIntegrationTest): checkpoint_name = "facebook/mbart-large-cc25" src_text = [ diff --git a/tests/test_modeling_mobilebert.py b/tests/test_modeling_mobilebert.py index 149494d20aa..c586858f102 100644 --- a/tests/test_modeling_mobilebert.py +++ b/tests/test_modeling_mobilebert.py @@ -17,7 +17,7 @@ import unittest from transformers import is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask @@ -411,6 +411,8 @@ TOLERANCE = 1e-3 @require_torch +@require_sentencepiece +@require_tokenizers class MobileBertModelIntegrationTests(unittest.TestCase): @slow def test_inference_no_head(self): diff --git a/tests/test_modeling_pegasus.py b/tests/test_modeling_pegasus.py index 61880e66871..55673bdd272 100644 --- a/tests/test_modeling_pegasus.py +++ b/tests/test_modeling_pegasus.py @@ -3,7 +3,7 @@ import unittest from transformers import AutoConfig, AutoTokenizer, is_torch_available from transformers.configuration_pegasus import task_specific_params from transformers.file_utils import cached_property -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from transformers.utils.logging import ERROR, set_verbosity from .test_modeling_bart import PGE_ARTICLE @@ -19,6 +19,8 @@ set_verbosity(ERROR) @require_torch +@require_sentencepiece +@require_tokenizers class PegasusXSUMIntegrationTest(AbstractSeq2SeqIntegrationTest): checkpoint_name = "google/pegasus-xsum" src_text = [PGE_ARTICLE, XSUM_ENTRY_LONGER] diff --git a/tests/test_modeling_rag.py b/tests/test_modeling_rag.py index 1a013dcee7d..b4dfea9b802 100644 --- a/tests/test_modeling_rag.py +++ b/tests/test_modeling_rag.py @@ -23,13 +23,12 @@ from unittest.mock import patch import numpy as np +from transformers import BartTokenizer, T5Tokenizer from transformers.file_utils import cached_property, is_datasets_available, is_faiss_available, is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device -from transformers.tokenization_bart import BartTokenizer +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer from transformers.tokenization_roberta import 
VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
-from transformers.tokenization_t5 import T5Tokenizer
 
 from .test_modeling_bart import ModelTester as BartModelTester
 from .test_modeling_dpr import DPRModelTester
@@ -89,6 +88,7 @@ def require_retrieval(test_case):
 
 @require_torch
 @require_retrieval
+@require_sentencepiece
 class RagTestMixin:
 
     all_model_classes = (
@@ -438,6 +438,8 @@ class RagDPRT5Test(RagTestMixin, unittest.TestCase):
 
 @require_torch
 @require_retrieval
+@require_sentencepiece
+@require_tokenizers
 class RagModelIntegrationTests(unittest.TestCase):
     @cached_property
     def sequence_model(self):
diff --git a/tests/test_modeling_reformer.py b/tests/test_modeling_reformer.py
index 14aa6550be1..e454d06685f 100644
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -16,7 +16,14 @@ import unittest
 
 from transformers import is_torch_available
-from transformers.testing_utils import require_multigpu, require_torch, slow, torch_device
+from transformers.testing_utils import (
+    require_multigpu,
+    require_sentencepiece,
+    require_tokenizers,
+    require_torch,
+    slow,
+    torch_device,
+)
 
 from .test_configuration_common import ConfigTester
 from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
@@ -680,6 +687,8 @@ class ReformerLSHAttnModelTest(ReformerTesterMixin, ModelTesterMixin, unittest.T
 
 @require_torch
+@require_sentencepiece
+@require_tokenizers
 class ReformerIntegrationTests(unittest.TestCase):
     """
     These integration tests test the current layer activations and gradients against the output of the Hugging Face Reformer model at time of integration: 29/06/2020. During integration, the model was tested against the output of the official Trax ReformerLM model for various cases ("lsh" only, "local" only, masked / non-masked, different chunk length, ....). In order to recover the original trax integration tests, one should use patrickvonplaten's fork of trax and the code that lives on the branch `reformer_trax_tests`.
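The test-file changes above and below mostly consist of adding the new @require_sentencepiece and @require_tokenizers markers to integration tests. Their actual implementation lives in transformers.testing_utils and is not shown in this diff; the sketch below only illustrates the skip-if-missing pattern, with hypothetical helper names and skip messages.

import unittest

def _package_available(name: str) -> bool:
    # Hypothetical helper: the real availability checks live elsewhere in the library.
    try:
        __import__(name)
        return True
    except ImportError:
        return False

def require_sentencepiece(test_case):
    # Skip the decorated test or test class when the sentencepiece package is missing.
    if not _package_available("sentencepiece"):
        return unittest.skip("test requires SentencePiece")(test_case)
    return test_case

def require_tokenizers(test_case):
    # Same pattern for the Rust-backed tokenizers package.
    if not _package_available("tokenizers"):
        return unittest.skip("test requires tokenizers")(test_case)
    return test_case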
diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py index cd1aa3a7ac7..5c0ec2709f3 100644 --- a/tests/test_modeling_roberta.py +++ b/tests/test_modeling_roberta.py @@ -17,7 +17,7 @@ import unittest from transformers import is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask @@ -394,6 +394,8 @@ class RobertaModelTest(ModelTesterMixin, unittest.TestCase): self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) +@require_sentencepiece +@require_tokenizers class RobertaModelIntegrationTest(unittest.TestCase): @slow def test_inference_masked_lm(self): diff --git a/tests/test_modeling_squeezebert.py b/tests/test_modeling_squeezebert.py index 58b08b2b6af..966f4771c58 100644 --- a/tests/test_modeling_squeezebert.py +++ b/tests/test_modeling_squeezebert.py @@ -17,7 +17,7 @@ import unittest from transformers import is_torch_available -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask @@ -271,6 +271,8 @@ class SqueezeBertModelTest(ModelTesterMixin, unittest.TestCase): self.assertIsNotNone(model) +@require_sentencepiece +@require_tokenizers class SqueezeBertModelIntegrationTest(unittest.TestCase): @slow def test_inference_classification_head(self): diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py index 8d14cc72442..8411032c387 100644 --- a/tests/test_modeling_t5.py +++ b/tests/test_modeling_t5.py @@ -20,7 +20,7 @@ import unittest from transformers import is_torch_available from transformers.file_utils import cached_property -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor @@ -29,9 +29,8 @@ from .test_modeling_common import ModelTesterMixin, ids_tensor if is_torch_available(): import torch - from transformers import T5Config, T5ForConditionalGeneration, T5Model + from transformers import T5Config, T5ForConditionalGeneration, T5Model, T5Tokenizer from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_LIST - from transformers.tokenization_t5 import T5Tokenizer class T5ModelTester: @@ -546,6 +545,8 @@ def use_task_specific_params(model, task): @require_torch +@require_sentencepiece +@require_tokenizers class T5ModelIntegrationTests(unittest.TestCase): @cached_property def model(self): diff --git a/tests/test_modeling_tf_camembert.py b/tests/test_modeling_tf_camembert.py index 865fc3be081..cfd96fe56e9 100644 --- a/tests/test_modeling_tf_camembert.py +++ b/tests/test_modeling_tf_camembert.py @@ -16,7 +16,7 @@ import unittest from transformers import is_tf_available -from transformers.testing_utils import require_tf, slow +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow if is_tf_available(): @@ -27,6 +27,8 @@ if is_tf_available(): @require_tf 
+@require_sentencepiece +@require_tokenizers class TFCamembertModelIntegrationTest(unittest.TestCase): @slow def test_output_embeds_base_model(self): diff --git a/tests/test_modeling_tf_flaubert.py b/tests/test_modeling_tf_flaubert.py index dbbdc15b2aa..1d7a533067c 100644 --- a/tests/test_modeling_tf_flaubert.py +++ b/tests/test_modeling_tf_flaubert.py @@ -16,7 +16,7 @@ import unittest from transformers import is_tf_available -from transformers.testing_utils import require_tf, slow +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow from .test_configuration_common import ConfigTester from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor @@ -332,6 +332,8 @@ class TFFlaubertModelTest(TFModelTesterMixin, unittest.TestCase): @require_tf +@require_sentencepiece +@require_tokenizers class TFFlaubertModelIntegrationTest(unittest.TestCase): @slow def test_output_embeds_base_model(self): diff --git a/tests/test_modeling_tf_longformer.py b/tests/test_modeling_tf_longformer.py index b0bd9bb260c..75e9355eb56 100644 --- a/tests/test_modeling_tf_longformer.py +++ b/tests/test_modeling_tf_longformer.py @@ -17,7 +17,7 @@ import unittest from transformers import is_tf_available -from transformers.testing_utils import require_tf, slow +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow from .test_configuration_common import ConfigTester from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor @@ -304,6 +304,8 @@ class TFLongformerModelTest(TFModelTesterMixin, unittest.TestCase): @require_tf +@require_sentencepiece +@require_tokenizers class TFLongformerModelIntegrationTest(unittest.TestCase): def _get_hidden_states(self): return tf.convert_to_tensor( diff --git a/tests/test_modeling_tf_roberta.py b/tests/test_modeling_tf_roberta.py index 9a4d0b037df..c79e8799d7a 100644 --- a/tests/test_modeling_tf_roberta.py +++ b/tests/test_modeling_tf_roberta.py @@ -17,7 +17,7 @@ import unittest from transformers import RobertaConfig, is_tf_available -from transformers.testing_utils import require_tf, slow +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow from .test_configuration_common import ConfigTester from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor @@ -222,6 +222,8 @@ class TFRobertaModelTest(TFModelTesterMixin, unittest.TestCase): @require_tf +@require_sentencepiece +@require_tokenizers class TFRobertaModelIntegrationTest(unittest.TestCase): @slow def test_inference_masked_lm(self): diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index f3c0cb81cd2..7a3a091f680 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -18,7 +18,7 @@ import unittest from transformers import T5Config, is_tf_available from transformers.file_utils import cached_property -from transformers.testing_utils import require_tf, slow +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow from .test_configuration_common import ConfigTester from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor @@ -285,6 +285,8 @@ class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase): @require_tf +@require_sentencepiece +@require_tokenizers class TFT5ModelIntegrationTests(unittest.TestCase): @cached_property def model(self): diff --git a/tests/test_modeling_tf_xlm_roberta.py b/tests/test_modeling_tf_xlm_roberta.py index 4092c2adf3f..b67d42db4e5 100644 --- 
a/tests/test_modeling_tf_xlm_roberta.py +++ b/tests/test_modeling_tf_xlm_roberta.py @@ -16,7 +16,7 @@ import unittest from transformers import is_tf_available -from transformers.testing_utils import require_tf, slow +from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow if is_tf_available(): @@ -27,6 +27,8 @@ if is_tf_available(): @require_tf +@require_sentencepiece +@require_tokenizers class TFFlaubertModelIntegrationTest(unittest.TestCase): @slow def test_output_embeds_base_model(self): diff --git a/tests/test_modeling_xlm_roberta.py b/tests/test_modeling_xlm_roberta.py index 8036c492316..abafeec6ef2 100644 --- a/tests/test_modeling_xlm_roberta.py +++ b/tests/test_modeling_xlm_roberta.py @@ -17,7 +17,7 @@ import unittest from transformers import is_torch_available -from transformers.testing_utils import slow +from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow if is_torch_available(): @@ -26,6 +26,8 @@ if is_torch_available(): from transformers import XLMRobertaModel +@require_sentencepiece +@require_tokenizers class XLMRobertaModelIntegrationTest(unittest.TestCase): @slow def test_xlm_roberta_base(self): diff --git a/tests/test_onnx.py b/tests/test_onnx.py index 6308bc523dc..17578ed670d 100644 --- a/tests/test_onnx.py +++ b/tests/test_onnx.py @@ -10,7 +10,7 @@ from transformers.convert_graph_to_onnx import ( infer_shapes, quantize, ) -from transformers.testing_utils import require_tf, require_torch, slow +from transformers.testing_utils import require_tf, require_tokenizers, require_torch, slow class FuncContiguousArgs: @@ -94,6 +94,7 @@ class OnnxExportTestCase(unittest.TestCase): self.fail(e) @require_torch + @require_tokenizers def test_infer_dynamic_axis_pytorch(self): """ Validate the dynamic axis generated for each parameters are correct @@ -105,6 +106,7 @@ class OnnxExportTestCase(unittest.TestCase): self._test_infer_dynamic_axis(model, tokenizer, "pt") @require_tf + @require_tokenizers def test_infer_dynamic_axis_tf(self): """ Validate the dynamic axis generated for each parameters are correct diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index e9ada812da0..b9635ccba6b 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -3,7 +3,7 @@ from typing import Iterable, List, Optional from transformers import pipeline from transformers.pipelines import SUPPORTED_TASKS, Conversation, DefaultArgumentHandler, Pipeline -from transformers.testing_utils import require_tf, require_torch, slow, torch_device +from transformers.testing_utils import require_tf, require_tokenizers, require_torch, slow, torch_device DEFAULT_DEVICE_NUM = -1 if torch_device == "cpu" else 0 @@ -342,6 +342,7 @@ class MonoColumnInputTestCase(unittest.TestCase): ) @require_torch + @require_tokenizers def test_torch_summarization(self): invalid_inputs = [4, ""] mandatory_keys = ["summary_text"] @@ -377,6 +378,7 @@ class MonoColumnInputTestCase(unittest.TestCase): ) @require_torch + @require_tokenizers def test_torch_translation(self): invalid_inputs = [4, ""] mandatory_keys = ["translation_text"] @@ -399,6 +401,7 @@ class MonoColumnInputTestCase(unittest.TestCase): self._test_mono_column_pipeline(nlp, VALID_INPUTS, mandatory_keys, invalid_inputs=invalid_inputs) @require_torch + @require_tokenizers def test_torch_text2text(self): invalid_inputs = [4, ""] mandatory_keys = ["generated_text"] diff --git a/tests/test_retrieval_rag.py b/tests/test_retrieval_rag.py index 1fa1cadb8f2..0f7e03e2f8d 100644 --- 
a/tests/test_retrieval_rag.py +++ b/tests/test_retrieval_rag.py @@ -14,7 +14,13 @@ from transformers.configuration_bart import BartConfig from transformers.configuration_dpr import DPRConfig from transformers.configuration_rag import RagConfig from transformers.retrieval_rag import RagRetriever -from transformers.testing_utils import require_datasets, require_faiss, require_torch +from transformers.testing_utils import ( + require_datasets, + require_faiss, + require_sentencepiece, + require_tokenizers, + require_torch, +) from transformers.tokenization_bart import BartTokenizer from transformers.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES from transformers.tokenization_dpr import DPRQuestionEncoderTokenizer @@ -189,6 +195,8 @@ class RagRetrieverTest(TestCase): self.assertListEqual(doc_ids.tolist(), [[1], [0]]) @require_torch + @require_tokenizers + @require_sentencepiece def test_hf_index_retriever_call(self): import torch diff --git a/tests/test_tokenization_albert.py b/tests/test_tokenization_albert.py index 724b98327eb..a9ba4a57d94 100644 --- a/tests/test_tokenization_albert.py +++ b/tests/test_tokenization_albert.py @@ -17,7 +17,8 @@ import os import unittest -from transformers.tokenization_albert import AlbertTokenizer, AlbertTokenizerFast +from transformers import AlbertTokenizer, AlbertTokenizerFast +from transformers.testing_utils import require_sentencepiece, require_tokenizers from .test_tokenization_common import TokenizerTesterMixin @@ -25,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model") +@require_sentencepiece +@require_tokenizers class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = AlbertTokenizer diff --git a/tests/test_tokenization_auto.py b/tests/test_tokenization_auto.py index 524a2282492..0633f4b36e9 100644 --- a/tests/test_tokenization_auto.py +++ b/tests/test_tokenization_auto.py @@ -33,6 +33,7 @@ from transformers.testing_utils import ( DUMMY_DIFF_TOKENIZER_IDENTIFIER, DUMMY_UNKWOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, + require_tokenizers, ) from transformers.tokenization_auto import TOKENIZER_MAPPING @@ -70,6 +71,7 @@ class AutoTokenizerTest(unittest.TestCase): self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast)) self.assertEqual(tokenizer.vocab_size, 12) + @require_tokenizers def test_tokenizer_identifier_with_correct_config(self): for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]: tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased") @@ -82,6 +84,7 @@ class AutoTokenizerTest(unittest.TestCase): self.assertEqual(tokenizer.max_len, 512) + @require_tokenizers def test_tokenizer_identifier_non_existent(self): for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]: with self.assertRaises(EnvironmentError): @@ -101,12 +104,16 @@ class AutoTokenizerTest(unittest.TestCase): msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__) ): self.assertFalse(issubclass(child_config, parent_config)) - self.assertFalse(issubclass(child_model_py, parent_model_py)) + + # Check for Slow tokenizer implementation if provided + if child_model_py and parent_model_py: + self.assertFalse(issubclass(child_model_py, parent_model_py)) # Check for Fast tokenizer implementation if provided if child_model_fast and parent_model_fast: self.assertFalse(issubclass(child_model_fast, parent_model_fast)) + 
@require_tokenizers
     def test_from_pretrained_use_fast_toggle(self):
         self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased"), BertTokenizer)
         self.assertIsInstance(AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True), BertTokenizerFast)
diff --git a/tests/test_tokenization_bart.py b/tests/test_tokenization_bart.py
index 0aa1c746848..3c6c88ef7ae 100644
--- a/tests/test_tokenization_bart.py
+++ b/tests/test_tokenization_bart.py
@@ -4,16 +4,19 @@ import unittest
 
 from transformers import BartTokenizer, BartTokenizerFast, BatchEncoding
 from transformers.file_utils import cached_property
-from transformers.testing_utils import require_torch
+from transformers.testing_utils import require_tokenizers, require_torch
 from transformers.tokenization_roberta import VOCAB_FILES_NAMES
 
-from .test_tokenization_common import TokenizerTesterMixin
+from .test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors
 
 
+@require_tokenizers
 class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BartTokenizer
     rust_tokenizer_class = BartTokenizerFast
     test_rust_tokenizer = True
+    from_pretrained_filter = filter_roberta_detectors
+    # from_pretrained_kwargs = {'add_prefix_space': True}
 
     def setUp(self):
         super().setUp()
@@ -56,7 +59,7 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
 
     def get_rust_tokenizer(self, **kwargs):
         kwargs.update(self.special_tokens_map)
-        return BartTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self, tokenizer):
         return "lower newer", "lower newer"
@@ -145,3 +148,38 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
         self.assertTrue((labels[:, 0] == tokenizer.bos_token_id).all().item())
         self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item())
         self.assertTrue((labels[:, -1] == tokenizer.eos_token_id).all().item())
+
+    def test_pretokenized_inputs(self):
+        pass
+
+    def test_embeded_special_tokens(self):
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                sentence = "A, <mask> AllenNLP sentence."
+                tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
+                tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
+
+                # token_type_ids should put 0 everywhere
+                self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
+
+                # attention_mask should put 1 everywhere, so sum over length should be 1
+                self.assertEqual(
+                    sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
+                    sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
+                )
+
+                tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
+                tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
+
+                # Rust correctly handles the space before the mask while python doesn't
+                self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
+                self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
+
+                self.assertSequenceEqual(
+                    tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
+                )
+                self.assertSequenceEqual(
+                    tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
+                )
diff --git a/tests/test_tokenization_bert.py b/tests/test_tokenization_bert.py
index 04117d8b3bd..43bda26df45 100644
--- a/tests/test_tokenization_bert.py
+++ b/tests/test_tokenization_bert.py
@@ -17,27 +17,29 @@ import os
 import unittest
 
-from transformers.testing_utils import slow
+from transformers import BertTokenizerFast
+from transformers.testing_utils import require_tokenizers, slow
 from transformers.tokenization_bert import (
     VOCAB_FILES_NAMES,
     BasicTokenizer,
     BertTokenizer,
-    BertTokenizerFast,
     WordpieceTokenizer,
     _is_control,
     _is_punctuation,
     _is_whitespace,
 )
 
-from .test_tokenization_common import TokenizerTesterMixin
+from .test_tokenization_common import TokenizerTesterMixin, filter_non_english
 
 
+@require_tokenizers
 class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BertTokenizer
     rust_tokenizer_class = BertTokenizerFast
     test_rust_tokenizer = True
     space_between_special_tokens = True
+    from_pretrained_filter = filter_non_english
 
     def setUp(self):
         super().setUp()
@@ -245,3 +247,55 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 
         assert encoded_sentence == [101] + text + [102]
         assert encoded_pair == [101] + text + [102] + text_2 + [102]
+
+    def test_offsets_with_special_characters(self):
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
+                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
+ tokens = tokenizer_r.encode_plus( + sentence, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) + + do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False + expected_results = ( + [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "A"), + ((1, 2), ","), + ((3, 5), "na"), + ((5, 6), "##ï"), + ((6, 8), "##ve"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "Allen"), + ((21, 23), "##NL"), + ((23, 24), "##P"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + if not do_lower_case + else [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "a"), + ((1, 2), ","), + ((3, 8), "naive"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "allen"), + ((21, 23), "##nl"), + ((23, 24), "##p"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + ) + + self.assertEqual( + [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) + ) + self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) diff --git a/tests/test_tokenization_bert_generation.py b/tests/test_tokenization_bert_generation.py index 1c635ee4a70..d1fc2f73499 100644 --- a/tests/test_tokenization_bert_generation.py +++ b/tests/test_tokenization_bert_generation.py @@ -17,9 +17,9 @@ import os import unittest +from transformers import BertGenerationTokenizer from transformers.file_utils import cached_property -from transformers.testing_utils import require_torch, slow -from transformers.tokenization_bert_generation import BertGenerationTokenizer +from transformers.testing_utils import require_sentencepiece, require_torch, slow from .test_tokenization_common import TokenizerTesterMixin @@ -29,6 +29,7 @@ SPIECE_UNDERLINE = "▁" SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") +@require_sentencepiece class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BertGenerationTokenizer diff --git a/tests/test_tokenization_bert_japanese.py b/tests/test_tokenization_bert_japanese.py index 9953dc72d45..092237f1abd 100644 --- a/tests/test_tokenization_bert_japanese.py +++ b/tests/test_tokenization_bert_japanese.py @@ -19,12 +19,12 @@ import pickle import unittest from transformers.testing_utils import custom_tokenizers -from transformers.tokenization_bert import WordpieceTokenizer from transformers.tokenization_bert_japanese import ( VOCAB_FILES_NAMES, BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer, + WordpieceTokenizer, ) from .test_tokenization_common import TokenizerTesterMixin diff --git a/tests/test_tokenization_camembert.py b/tests/test_tokenization_camembert.py index c8eae66d48a..672399e9494 100644 --- a/tests/test_tokenization_camembert.py +++ b/tests/test_tokenization_camembert.py @@ -17,8 +17,8 @@ import os import unittest -from transformers.testing_utils import _torch_available -from transformers.tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast +from transformers import CamembertTokenizer, CamembertTokenizerFast +from transformers.testing_utils import _torch_available, require_sentencepiece, require_tokenizers from .test_tokenization_common import TokenizerTesterMixin @@ -28,6 +28,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture FRAMEWORK = "pt" if _torch_available else "tf" +@require_sentencepiece +@require_tokenizers class 
CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = CamembertTokenizer diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 74c57e5e3cf..d07cd9d4ef3 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -14,16 +14,18 @@ # limitations under the License. +import inspect import os import pickle import re import shutil import tempfile from collections import OrderedDict +from itertools import takewhile from typing import TYPE_CHECKING, Dict, List, Tuple, Union -from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast -from transformers.testing_utils import require_tf, require_torch, slow +from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast, is_torch_available +from transformers.testing_utils import get_tests_dir, require_tf, require_tokenizers, require_torch, slow from transformers.tokenization_utils import AddedToken @@ -31,6 +33,18 @@ if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel, TFPreTrainedModel +NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"] + + +def filter_non_english(_, pretrained_name: str): + """ Filter all the model for non-english language """ + return not any([lang in pretrained_name for lang in NON_ENGLISH_TAGS]) + + +def filter_roberta_detectors(_, pretrained_name: str): + return "detector" not in pretrained_name + + def merge_model_tokenizer_mappings( model_mapping: Dict["PretrainedConfig", Union["PreTrainedModel", "TFPreTrainedModel"]], tokenizer_mapping: Dict["PretrainedConfig", Tuple["PreTrainedTokenizer", "PreTrainedTokenizerFast"]], @@ -59,8 +73,32 @@ class TokenizerTesterMixin: rust_tokenizer_class = None test_rust_tokenizer = False space_between_special_tokens = False + from_pretrained_kwargs = None + from_pretrained_filter = None + from_pretrained_vocab_key = "vocab_file" + + def setUp(self) -> None: + # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the + # information available in Tokenizer (name, rust class, python class, vocab key name) + if self.test_rust_tokenizer: + tokenizers_list = [ + ( + self.rust_tokenizer_class, + pretrained_name, + self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {}, + ) + for pretrained_name in self.rust_tokenizer_class.pretrained_vocab_files_map[ + self.from_pretrained_vocab_key + ].keys() + if self.from_pretrained_filter is None + or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name)) + ] + self.tokenizers_list = tokenizers_list[:1] # Let's just test the first pretrained vocab for speed + else: + self.tokenizers_list = [] + with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: + self._data = f_data.read().replace("\n\n", "\n").strip() - def setUp(self): self.tmpdirname = tempfile.mkdtemp() def tearDown(self): @@ -123,6 +161,15 @@ class TokenizerTesterMixin: for i in range(len(batch_encode_plus_sequences["input_ids"])) ] + def test_rust_tokenizer_signature(self): + if not self.test_rust_tokenizer: + return + + signature = inspect.signature(self.rust_tokenizer_class.__init__) + + self.assertIn("tokenizer_file", signature.parameters) + self.assertIsNone(signature.parameters["tokenizer_file"].default) + def test_rust_and_python_full_tokenizers(self): if not self.test_rust_tokenizer: return @@ -206,7 +253,6 @@ class TokenizerTesterMixin: 
shutil.rmtree(tmpdirname) - # Now let's start the test tokenizers = self.get_tokenizers(model_max_length=42) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): @@ -237,6 +283,39 @@ class TokenizerTesterMixin: shutil.rmtree(tmpdirname) + # Test that we can also use the non-legacy saving format for fast tokenizers + tokenizers = self.get_tokenizers(model_max_length=42) + for tokenizer in tokenizers: + if not tokenizer.is_fast: + continue + with self.subTest(f"{tokenizer.__class__.__name__}"): + # Isolate this from the other tests because we save additional tokens/etc + tmpdirname = tempfile.mkdtemp() + + sample_text = " He is very happy, UNwant\u00E9d,running" + tokenizer.add_tokens(["bim", "bambam"]) + additional_special_tokens = tokenizer.additional_special_tokens + additional_special_tokens.append("new_additional_special_token") + tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens}) + before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) + before_vocab = tokenizer.get_vocab() + tokenizer.save_pretrained(tmpdirname) + + after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname) + after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False) + after_vocab = after_tokenizer.get_vocab() + self.assertListEqual(before_tokens, after_tokens) + self.assertDictEqual(before_vocab, after_vocab) + self.assertIn("bim", after_vocab) + self.assertIn("bambam", after_vocab) + self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens) + self.assertEqual(after_tokenizer.model_max_length, 42) + + tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43) + self.assertEqual(tokenizer.model_max_length, 43) + + shutil.rmtree(tmpdirname) + def test_pickle_tokenizer(self): """Google pickle __getstate__ __setstate__ if you are struggling with this.""" tokenizers = self.get_tokenizers() @@ -258,6 +337,7 @@ class TokenizerTesterMixin: self.assertListEqual(subwords, subwords_loaded) + @require_tokenizers def test_pickle_added_tokens(self): tok1 = AddedToken("", rstrip=True, lstrip=True, normalized=False, single_word=True) tok2 = pickle.loads(pickle.dumps(tok1)) @@ -419,6 +499,7 @@ class TokenizerTesterMixin: self.assertEqual(text_2, output_text) + @require_tokenizers def test_encode_decode_with_spaces(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: @@ -437,6 +518,15 @@ class TokenizerTesterMixin: self.assertIn(decoded, [output, output.lower()]) def test_pretrained_model_lists(self): + # We should have at least one default checkpoint for each tokenizer + # We should specify the max input length as well (used in some part to list the pretrained checkpoints) + self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1) + self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1) + self.assertEqual( + len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), + len(self.tokenizer_class.max_model_input_sizes), + ) + weights_list = list(self.tokenizer_class.max_model_input_sizes.keys()) weights_lists_2 = [] for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items(): @@ -1226,6 +1316,7 @@ class TokenizerTesterMixin: encoded_sequences_batch_padded_2[key], ) + @require_tokenizers def test_added_token_serializable(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: @@ -1652,3 +1743,772 @@ class 
TokenizerTesterMixin: self.assertEqual(batch_encoder_only.input_ids.shape[1], 3) self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3) self.assertNotIn("decoder_input_ids", batch_encoder_only) + + def test_is_fast(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Check is_fast is set correctly + self.assertFalse(tokenizer_p.is_fast) + self.assertTrue(tokenizer_r.is_fast) + + def test_fast_only_inputs(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Ensure None raise an error + self.assertRaises(TypeError, tokenizer_r.tokenize, None) + self.assertRaises(TypeError, tokenizer_r.encode, None) + self.assertRaises(TypeError, tokenizer_r.encode_plus, None) + self.assertRaises(TypeError, tokenizer_r.batch_encode_plus, None) + + def test_alignement_methods(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] + text = " ".join(words) + batch_size = 3 + + encoding = tokenizer_r.encode_plus(text, add_special_tokens=False) + + batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False) + num_tokens = len(encoding["input_ids"]) + + last_word_index = len(words) - 1 + last_token_index = num_tokens - 1 + last_batch_index = batch_size - 1 + last_char_index = len(text) - 1 + + # words, tokens + self.assertEqual(len(encoding.words(0)), num_tokens) + self.assertEqual(max(encoding.words(0)), last_word_index) + self.assertEqual(min(encoding.words(0)), 0) + self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens) + self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index) + self.assertEqual(min(batch_encoding.words(last_batch_index)), 0) + self.assertEqual(len(encoding.tokens(0)), num_tokens) + + # Assert token_to_word + self.assertEqual(encoding.token_to_word(0), 0) + self.assertEqual(encoding.token_to_word(0, 0), 0) + self.assertEqual(encoding.token_to_word(last_token_index), last_word_index) + self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index) + self.assertEqual(batch_encoding.token_to_word(1, 0), 0) + self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index) + self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index) + + # Assert word_to_tokens + self.assertEqual(encoding.word_to_tokens(0).start, 0) + self.assertEqual(encoding.word_to_tokens(0, 0).start, 0) + self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1) + self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) + self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0) + self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) + self.assertEqual( + batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 
1 + ) + + # Assert token_to_chars + self.assertEqual(encoding.token_to_chars(0).start, 0) + self.assertEqual(encoding.token_to_chars(0, 0).start, 0) + self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1) + self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) + self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0) + self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) + self.assertEqual( + batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1 + ) + + # Assert char_to_token + self.assertEqual(encoding.char_to_token(0), 0) + self.assertEqual(encoding.char_to_token(0, 0), 0) + self.assertEqual(encoding.char_to_token(last_char_index), last_token_index) + self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index) + self.assertEqual(batch_encoding.char_to_token(1, 0), 0) + self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index) + self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index) + + # Assert char_to_word + self.assertEqual(encoding.char_to_word(0), 0) + self.assertEqual(encoding.char_to_word(0, 0), 0) + self.assertEqual(encoding.char_to_word(last_char_index), last_word_index) + self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index) + self.assertEqual(batch_encoding.char_to_word(1, 0), 0) + self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index) + self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index) + + # Assert word_to_chars + self.assertEqual(encoding.word_to_chars(0).start, 0) + self.assertEqual(encoding.word_to_chars(0, 0).start, 0) + self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1) + self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) + self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0) + self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) + self.assertEqual( + batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1 + ) + + def test_tokenization_python_rust_equals(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Ensure basic input match + input_p = tokenizer_p.encode_plus(self._data) + input_r = tokenizer_r.encode_plus(self._data) + + for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): + self.assertSequenceEqual(input_p[key], input_r[key]) + + input_pairs_p = tokenizer_p.encode_plus(self._data, self._data) + input_pairs_r = tokenizer_r.encode_plus(self._data, self._data) + + for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): + self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key]) + + # Ensure truncation match + input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True) + input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True) + + for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): + self.assertSequenceEqual(input_p[key], 
input_r[key]) + + # Ensure truncation with stride match + input_p = tokenizer_p.encode_plus( + self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True + ) + input_r = tokenizer_r.encode_plus( + self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True + ) + + for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): + self.assertSequenceEqual(input_p[key], input_r[key][0]) + + def test_num_special_tokens_to_add_equal(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Check we have the same number of added_tokens for both pair and non-pair inputs. + self.assertEqual( + tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False) + ) + self.assertEqual( + tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True) + ) + + def test_max_length_equal(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Check we have the correct max_length for both pair and non-pair inputs. + self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) + self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) + + def test_special_tokens_map_equal(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Assert the set of special tokens match. 
+ self.assertSequenceEqual( + tokenizer_p.special_tokens_map.items(), + tokenizer_r.special_tokens_map.items(), + ) + + def test_add_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + vocab_size = len(tokenizer_r) + self.assertEqual(tokenizer_r.add_tokens(""), 0) + self.assertEqual(tokenizer_r.add_tokens("testoken"), 1) + self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2) + self.assertEqual(len(tokenizer_r), vocab_size + 3) + + self.assertEqual(tokenizer_r.add_special_tokens({}), 0) + self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2) + self.assertRaises( + AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": ""} + ) + self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": [""]}), 1) + self.assertEqual( + tokenizer_r.add_special_tokens({"additional_special_tokens": ["", ""]}), 2 + ) + self.assertEqual(len(tokenizer_r), vocab_size + 8) + + def test_offsets_mapping(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + text = "Wonderful no inspiration example with subtoken" + pair = "Along with an awesome pair" + + # No pair + tokens_with_offsets = tokenizer_r.encode_plus( + text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True + ) + added_tokens = tokenizer_r.num_special_tokens_to_add(False) + offsets = tokens_with_offsets["offset_mapping"] + + # Assert there is the same number of tokens and offsets + self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) + + # Assert there are exactly added_tokens special tokens + self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) + + # Pairs + tokens_with_offsets = tokenizer_r.encode_plus( + text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True + ) + added_tokens = tokenizer_r.num_special_tokens_to_add(True) + offsets = tokens_with_offsets["offset_mapping"] + + # Assert there is the same number of tokens and offsets + self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) + + # Assert there are exactly added_tokens special tokens + self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) + + def test_batch_encode_dynamic_overflowing(self): + """ + When calling batch_encode with multiple sequences it can return a different number of + overflowing encodings for each sequence: + [ + Sequence 1: [Encoding 1, Encoding 2], + Sequence 2: [Encoding 1], + Sequence 3: [Encoding 1, Encoding 2, ...
Encoding N] + ] + This needs to be padded so that it can be represented as a tensor + """ + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + with self.subTest( + "{} ({}, {})".format(tokenizer.__class__.__name__, pretrained_name, tokenizer.__class__.__name__) + ): + + returned_tensor = "pt" if is_torch_available() else "tf" + + if not tokenizer.pad_token or tokenizer.pad_token_id < 0: + return + + tokens = tokenizer.encode_plus( + "HuggingFace is solving NLP one commit at a time", + max_length=6, + padding=True, + truncation=True, + return_tensors=returned_tensor, + return_overflowing_tokens=True, + ) + + for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): + self.assertEqual(len(tokens[key].shape), 2) + + # Mono sample + tokens = tokenizer.batch_encode_plus( + ["HuggingFace is solving NLP one commit at a time"], + max_length=6, + padding=True, + truncation="only_first", + return_tensors=returned_tensor, + return_overflowing_tokens=True, + ) + + for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): + self.assertEqual(len(tokens[key].shape), 2) + self.assertEqual(tokens[key].shape[-1], 6) + + # Multi sample + tokens = tokenizer.batch_encode_plus( + ["HuggingFace is solving NLP one commit at a time", "Very tiny input"], + max_length=6, + padding=True, + truncation="only_first", + return_tensors=returned_tensor, + return_overflowing_tokens=True, + ) + + for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): + self.assertEqual(len(tokens[key].shape), 2) + self.assertEqual(tokens[key].shape[-1], 6) + + def test_compare_pretokenized_inputs(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + if hasattr(tokenizer_p, "add_prefix_space") and not tokenizer_p.add_prefix_space: + continue # Too hard to test for now + + # Input string + pretokenized_input_simple = "This is a sample input".split() + pretokenized_input_pair = "This is a sample pair".split() + + # Test encode for pretokenized inputs + output_r = tokenizer_r.encode( + pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False + ) + output_p = tokenizer_p.encode( + pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False + ) + self.assertEqual(output_p, output_r) + + kwargs = { + "is_split_into_words": True, + # "return_token_type_ids": True, # Use the defaults for each tokenizer + # "return_attention_mask": True, # Use the defaults for each tokenizer + "return_overflowing_tokens": False, + "return_special_tokens_mask": True, + "return_offsets_mapping": False, # Not implemented in python tokenizers + # "add_special_tokens": False, + } + batch_kwargs = { + "is_split_into_words": True, + # "return_token_type_ids": True, # Use the defaults for each tokenizer + # "return_attention_mask": True, # Use the defaults for each tokenizer + "return_overflowing_tokens": False, + "return_special_tokens_mask": True, + "return_offsets_mapping": False, # Not implemented in python tokenizers + # "add_special_tokens": False, + } + # Test encode_plus for pretokenized inputs + output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs) + output_p =
tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs) + for key in output_p.keys(): + self.assertEqual(output_p[key], output_r[key]) + + # Test batch_encode_plus for pretokenized inputs + input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair] + output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs) + output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs) + for key in output_p.keys(): + self.assertEqual(output_p[key], output_r[key]) + + # Test encode for pretokenized inputs pairs + output_r = tokenizer_r.encode( + pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True + ) + output_p = tokenizer_p.encode( + pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True + ) + self.assertEqual(output_p, output_r) + + # Test encode_plus for pretokenized inputs + output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs) + output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs) + for key in output_p.keys(): + self.assertEqual(output_p[key], output_r[key]) + + # Test batch_encode_plus for pretokenized inputs + input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [ + pretokenized_input_simple + pretokenized_input_pair, + pretokenized_input_pair, + ] + output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs) + output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs) + for key in output_p.keys(): + self.assertEqual(output_p[key], output_r[key]) + + def test_create_token_type_ids(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + input_simple = [1, 2, 3] + input_pair = [1, 2, 3] + + # Generate output + output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple) + output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple) + self.assertEqual(output_p, output_r) + + # Generate pair output + output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair) + output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair) + self.assertEqual(output_p, output_r) + + def test_build_inputs_with_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + # # Input string + # input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False) + # input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False) + + # # Generate output + # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple) + # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) + # self.assertEqual(output_p, output_r) + + # # Generate pair output + # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) + # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) + # self.assertEqual(output_p, output_r) + + # Input tokens id + input_simple = 
tokenizer_p.encode("This is a sample input", add_special_tokens=False) + input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False) + + # Generate output + output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple) + output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) + self.assertEqual(output_p, output_r) + + # Generate pair output + output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) + output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) + self.assertEqual(output_p, output_r) + + def test_padding(self, max_length=50): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + def assert_padded_input_match(input_r: list, input_p: list, max_length: int): + + # Ensure we match max_length + self.assertEqual(len(input_r), max_length) + self.assertEqual(len(input_p), max_length) + + # Ensure the number of padded tokens is the same + padded_tokens_r = list(takewhile(lambda i: i == tokenizer_r.pad_token_id, reversed(input_r))) + padded_tokens_p = list(takewhile(lambda i: i == tokenizer_p.pad_token_id, reversed(input_p))) + self.assertSequenceEqual(padded_tokens_r, padded_tokens_p) + + def assert_batch_padded_input_match(input_r: dict, input_p: dict, max_length: int): + for i_r in input_r.values(): + self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual( + len(i_r[1]), max_length + ) + self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual( + len(i_r[1]), max_length + ) + + for i_r, i_p in zip(input_r["input_ids"], input_p["input_ids"]): + assert_padded_input_match(i_r, i_p, max_length) + + for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]): + self.assertSequenceEqual(i_r, i_p) + + # Encode - Simple input + input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) + assert_padded_input_match(input_r, input_p, max_length) + input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length") + input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length") + assert_padded_input_match(input_r, input_p, max_length) + + input_r = tokenizer_r.encode("This is a simple input", padding="longest") + input_p = tokenizer_p.encode("This is a simple input", padding=True) + assert_padded_input_match(input_r, input_p, len(input_r)) + + # Encode - Pair input + input_r = tokenizer_r.encode( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + assert_padded_input_match(input_r, input_p, max_length) + input_r = tokenizer_r.encode( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + assert_padded_input_match(input_r, input_p, max_length) + input_r = tokenizer_r.encode("This is a simple input", "This is a 
pair", padding=True) + input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest") + assert_padded_input_match(input_r, input_p, len(input_r)) + + # Encode_plus - Simple input + input_r = tokenizer_r.encode_plus( + "This is a simple input", max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", max_length=max_length, pad_to_max_length=True + ) + assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus( + "This is a simple input", max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", max_length=max_length, padding="max_length" + ) + assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest") + input_p = tokenizer_p.encode_plus("This is a simple input", padding=True) + assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"])) + + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + # Encode_plus - Pair input + input_r = tokenizer_r.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" + ) + assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest") + input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True) + assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"])) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) + + # Batch_encode_plus - Simple input + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + pad_to_max_length=True, + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + pad_to_max_length=True, + ) + assert_batch_padded_input_match(input_r, input_p, max_length) + + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding="max_length", + ) + assert_batch_padded_input_match(input_r, input_p, max_length) + + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding="longest", + ) + 
input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], + max_length=max_length, + padding=True, + ) + assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0])) + + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], padding="longest" + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], padding=True + ) + assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0])) + + # Batch_encode_plus - Pair input + input_r = tokenizer_r.batch_encode_plus( + [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ], + max_length=max_length, + truncation=True, + padding="max_length", + ) + input_p = tokenizer_p.batch_encode_plus( + [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ], + max_length=max_length, + truncation=True, + padding="max_length", + ) + assert_batch_padded_input_match(input_r, input_p, max_length) + + input_r = tokenizer_r.batch_encode_plus( + [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ], + padding=True, + ) + input_p = tokenizer_p.batch_encode_plus( + [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ], + padding="longest", + ) + assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0])) + + # Using pad on single examples after tokenization + input_r = tokenizer_r.encode_plus("This is an input 1") + input_r = tokenizer_r.pad(input_r) + + input_p = tokenizer_r.encode_plus("This is an input 1") + input_p = tokenizer_r.pad(input_p) + + assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"])) + + # Using pad on single examples after tokenization + input_r = tokenizer_r.encode_plus("This is an input 1") + input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") + + input_p = tokenizer_r.encode_plus("This is an input 1") + input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") + + assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) + + # Using pad after tokenization + input_r = tokenizer_r.batch_encode_plus( + ["This is an input 1", "This is a much longer input which should be padded"] + ) + input_r = tokenizer_r.pad(input_r) + + input_p = tokenizer_r.batch_encode_plus( + ["This is an input 1", "This is a much longer input which should be padded"] + ) + input_p = tokenizer_r.pad(input_p) + + assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0])) + + # Using pad after tokenization + input_r = tokenizer_r.batch_encode_plus( + ["This is an input 1", "This is a much longer input which should be padded"] + ) + input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") + + input_p = tokenizer_r.batch_encode_plus( + ["This is an input 1", "This is a much longer input which should be padded"] + ) + input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") + + assert_batch_padded_input_match(input_r, input_p, max_length) + + def test_save_pretrained(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r =
self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + tmpdirname2 = tempfile.mkdtemp() + + tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2) + tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2) + # Checks it save with the same files + self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files) + + # Checks everything loads correctly in the same way + tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2) + tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2) + + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key)) + # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id")) + + shutil.rmtree(tmpdirname2) + + def test_embeded_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + sentence = "A, AllenNLP sentence." + tokens_r = tokenizer_r.encode_plus( + sentence, + add_special_tokens=True, + ) + tokens_p = tokenizer_p.encode_plus( + sentence, + add_special_tokens=True, + ) + + for key in tokens_p.keys(): + self.assertEqual(tokens_r[key], tokens_p[key]) + + if "token_type_ids" in tokens_r: + self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) + + tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) + tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) + self.assertSequenceEqual(tokens_r, tokens_p) + + def test_compare_add_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) + # pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True) + + for text in ["", " "]: + # tokenize() + no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False) + with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True) + self.assertEqual( + len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add + ) + + # encode() + no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True) + self.assertEqual( + len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add + ) + + # encode_plus() + no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True) + for key in no_special_tokens.keys(): + self.assertEqual( + len(no_special_tokens[key]), + len(with_special_tokens[key]) - simple_num_special_tokens_to_add, + ) + + # # batch_encode_plus + no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False) + with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True) + for key in no_special_tokens.keys(): + for i_no, i_with in 
zip(no_special_tokens[key], with_special_tokens[key]): + self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add) + + def test_compare_prepare_for_model(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + string_sequence = "Asserting that both tokenizers are equal" + python_output = tokenizer_p.prepare_for_model( + tokenizer_p.encode(string_sequence, add_special_tokens=False) + ) + rust_output = tokenizer_r.prepare_for_model( + tokenizer_r.encode(string_sequence, add_special_tokens=False) + ) + for key in python_output: + self.assertEqual(python_output[key], rust_output[key]) diff --git a/tests/test_tokenization_distilbert.py b/tests/test_tokenization_distilbert.py index b076e2c779c..7b75f55e304 100644 --- a/tests/test_tokenization_distilbert.py +++ b/tests/test_tokenization_distilbert.py @@ -14,12 +14,13 @@ # limitations under the License. -from transformers.testing_utils import slow -from transformers.tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast +from transformers import DistilBertTokenizer, DistilBertTokenizerFast +from transformers.testing_utils import require_tokenizers, slow from .test_tokenization_bert import BertTokenizationTest +@require_tokenizers class DistilBertTokenizationTest(BertTokenizationTest): tokenizer_class = DistilBertTokenizer diff --git a/tests/test_tokenization_dpr.py b/tests/test_tokenization_dpr.py index d9ec74ec5dc..bc5ccb319e7 100644 --- a/tests/test_tokenization_dpr.py +++ b/tests/test_tokenization_dpr.py @@ -14,8 +14,7 @@ # limitations under the License. 
-from transformers.testing_utils import slow -from transformers.tokenization_dpr import ( +from transformers import ( DPRContextEncoderTokenizer, DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizer, @@ -24,11 +23,13 @@ from transformers.tokenization_dpr import ( DPRReaderTokenizer, DPRReaderTokenizerFast, ) +from transformers.testing_utils import require_tokenizers, slow from transformers.tokenization_utils_base import BatchEncoding from .test_tokenization_bert import BertTokenizationTest +@require_tokenizers class DPRContextEncoderTokenizationTest(BertTokenizationTest): tokenizer_class = DPRContextEncoderTokenizer @@ -36,6 +37,7 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest): test_rust_tokenizer = True +@require_tokenizers class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): tokenizer_class = DPRQuestionEncoderTokenizer @@ -43,6 +45,7 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): test_rust_tokenizer = True +@require_tokenizers class DPRReaderTokenizationTest(BertTokenizationTest): tokenizer_class = DPRReaderTokenizer diff --git a/tests/test_tokenization_fast.py b/tests/test_tokenization_fast.py deleted file mode 100644 index 818b357b011..00000000000 --- a/tests/test_tokenization_fast.py +++ /dev/null @@ -1,1076 +0,0 @@ -import logging -import shutil -import tempfile -import unittest -from collections import namedtuple -from itertools import takewhile - -from transformers import ( - AlbertTokenizer, - AlbertTokenizerFast, - BartTokenizer, - BartTokenizerFast, - BertTokenizer, - BertTokenizerFast, - CamembertTokenizer, - CamembertTokenizerFast, - DistilBertTokenizer, - DistilBertTokenizerFast, - DPRContextEncoderTokenizer, - DPRContextEncoderTokenizerFast, - DPRQuestionEncoderTokenizer, - DPRQuestionEncoderTokenizerFast, - DPRReaderTokenizer, - DPRReaderTokenizerFast, - FunnelTokenizer, - FunnelTokenizerFast, - GPT2Tokenizer, - GPT2TokenizerFast, - LxmertTokenizer, - LxmertTokenizerFast, - MBartTokenizer, - MBartTokenizerFast, - OpenAIGPTTokenizer, - OpenAIGPTTokenizerFast, - PegasusTokenizer, - PegasusTokenizerFast, - ReformerTokenizer, - ReformerTokenizerFast, - RobertaTokenizer, - RobertaTokenizerFast, - T5Tokenizer, - T5TokenizerFast, - XLMRobertaTokenizer, - XLMRobertaTokenizerFast, - XLNetTokenizer, - XLNetTokenizerFast, - is_torch_available, -) -from transformers.testing_utils import get_tests_dir - - -logger = logging.getLogger(__name__) - -NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"] -Tokenizer = namedtuple("Tokenizer", ["name", "rust_cls", "python_cls", "vocab_key", "filter", "kwargs"]) - - -def filter_non_english(_: Tokenizer, pretrained_name: str): - """ Filter all the model for non-english language """ - return not any([lang in pretrained_name for lang in NON_ENGLISH_TAGS]) - - -def filter_roberta_detectors(_: Tokenizer, pretrained_name: str): - return "detector" not in pretrained_name - - -class CommonFastTokenizerTest(unittest.TestCase): - - TOKENIZERS_CLASSES = frozenset([]) - - def setUp(self) -> None: - # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the - # information available in Tokenizer (name, rust class, python class, vocab key name) - self.tokenizers_list = [ - (tok_case, pretrained_name, dict(t for t in tok_case.kwargs) if tok_case.kwargs else {}) - for tok_case in self.TOKENIZERS_CLASSES - for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys() - if tok_case.filter is None or 
(tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name)) - ] - with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: - self._data = f_data.read().replace("\n\n", "\n").strip() - - self.tmpdirname = tempfile.mkdtemp() - - def tearDown(self): - shutil.rmtree(self.tmpdirname) - - def test_is_fast(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - - # Check is_fast is set correctly - self.assertFalse(tokenizer_p.is_fast) - self.assertTrue(tokenizer_r.is_fast) - - def test_fast_only_inputs(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - - # Ensure None raise an error - self.assertRaises(TypeError, tokenizer_r.tokenize, None) - self.assertRaises(TypeError, tokenizer_r.encode, None) - self.assertRaises(TypeError, tokenizer_r.encode_plus, None) - self.assertRaises(TypeError, tokenizer_r.batch_encode_plus, None) - - def test_alignement_methods(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - - words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] - text = " ".join(words) - batch_size = 3 - - encoding = tokenizer_r.encode_plus(text, add_special_tokens=False) - - batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False) - num_tokens = len(encoding["input_ids"]) - - last_word_index = len(words) - 1 - last_token_index = num_tokens - 1 - last_batch_index = batch_size - 1 - last_char_index = len(text) - 1 - - # words, tokens - self.assertEqual(len(encoding.words(0)), num_tokens) - self.assertEqual(max(encoding.words(0)), last_word_index) - self.assertEqual(min(encoding.words(0)), 0) - self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens) - self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index) - self.assertEqual(min(batch_encoding.words(last_batch_index)), 0) - self.assertEqual(len(encoding.tokens(0)), num_tokens) - - # Assert token_to_word - self.assertEqual(encoding.token_to_word(0), 0) - self.assertEqual(encoding.token_to_word(0, 0), 0) - self.assertEqual(encoding.token_to_word(last_token_index), last_word_index) - self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index) - self.assertEqual(batch_encoding.token_to_word(1, 0), 0) - self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index) - self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index) - - # Assert word_to_tokens - self.assertEqual(encoding.word_to_tokens(0).start, 0) - self.assertEqual(encoding.word_to_tokens(0, 0).start, 0) - self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1) - self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) - self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0) - self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) - self.assertEqual( - 
batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1 - ) - - # Assert token_to_chars - self.assertEqual(encoding.token_to_chars(0).start, 0) - self.assertEqual(encoding.token_to_chars(0, 0).start, 0) - self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1) - self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) - self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0) - self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) - self.assertEqual( - batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1 - ) - - # Assert char_to_token - self.assertEqual(encoding.char_to_token(0), 0) - self.assertEqual(encoding.char_to_token(0, 0), 0) - self.assertEqual(encoding.char_to_token(last_char_index), last_token_index) - self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index) - self.assertEqual(batch_encoding.char_to_token(1, 0), 0) - self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index) - self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index) - - # Assert char_to_word - self.assertEqual(encoding.char_to_word(0), 0) - self.assertEqual(encoding.char_to_word(0, 0), 0) - self.assertEqual(encoding.char_to_word(last_char_index), last_word_index) - self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index) - self.assertEqual(batch_encoding.char_to_word(1, 0), 0) - self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index) - self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index) - - # Assert word_to_chars - self.assertEqual(encoding.word_to_chars(0).start, 0) - self.assertEqual(encoding.word_to_chars(0, 0).start, 0) - self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1) - self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) - self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0) - self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) - self.assertEqual( - batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1 - ) - - def test_tokenization_python_rust_equals(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - - # Ensure basic input match - input_p = tokenizer_p.encode_plus(self._data) - input_r = tokenizer_r.encode_plus(self._data) - - for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assertSequenceEqual(input_p[key], input_r[key]) - - input_pairs_p = tokenizer_p.encode_plus(self._data, self._data) - input_pairs_r = tokenizer_r.encode_plus(self._data, self._data) - - for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key]) - - # Ensure truncation match - input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True) - input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True) - - for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], 
input_p.keys()): - self.assertSequenceEqual(input_p[key], input_r[key]) - - # Ensure truncation with stride match - input_p = tokenizer_p.encode_plus( - self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True - ) - input_r = tokenizer_r.encode_plus( - self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True - ) - - for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assertSequenceEqual(input_p[key], input_r[key][0]) - - def test_num_special_tokens_to_add_equal(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - - # Check we have the same number of added_tokens for both pair and non-pair inputs. - self.assertEqual( - tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False) - ) - self.assertEqual( - tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True) - ) - - def test_max_length_equal(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - - # Check we have the correct max_length for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) - self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) - - def test_special_tokens_map_equal(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - - # Assert the set of special tokens match. 
- self.assertSequenceEqual( - tokenizer_p.special_tokens_map.items(), - tokenizer_r.special_tokens_map.items(), - ) - - def test_add_tokens(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - - vocab_size = len(tokenizer_r) - self.assertEqual(tokenizer_r.add_tokens(""), 0) - self.assertEqual(tokenizer_r.add_tokens("testoken"), 1) - self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2) - self.assertEqual(len(tokenizer_r), vocab_size + 3) - - self.assertEqual(tokenizer_r.add_special_tokens({}), 0) - self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2) - self.assertRaises( - AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": ""} - ) - self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": [""]}), 1) - self.assertEqual( - tokenizer_r.add_special_tokens({"additional_special_tokens": ["", ""]}), 2 - ) - self.assertEqual(len(tokenizer_r), vocab_size + 8) - - def test_offsets_mapping(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - - text = "Wonderful no inspiration example with subtoken" - pair = "Along with an awesome pair" - - # No pair - tokens_with_offsets = tokenizer_r.encode_plus( - text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True - ) - added_tokens = tokenizer_r.num_special_tokens_to_add(False) - offsets = tokens_with_offsets["offset_mapping"] - - # Assert there is the same number of tokens and offsets - self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) - - # Assert there is online added_tokens special_tokens - self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) - - # Pairs - tokens_with_offsets = tokenizer_r.encode_plus( - text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True - ) - added_tokens = tokenizer_r.num_special_tokens_to_add(True) - offsets = tokens_with_offsets["offset_mapping"] - - # Assert there is the same number of tokens and offsets - self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) - - # Assert there is online added_tokens special_tokens - self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) - - def test_batch_encode_dynamic_overflowing(self): - """ - When calling batch_encode with multiple sequence it can returns different number of - overflowing encoding for each sequence: - [ - Sequence 1: [Encoding 1, Encoding 2], - Sequence 2: [Encoding 1], - Sequence 3: [Encoding 1, Encoding 2, ... 
Encoding N] - ] - This needs to be padded so that it can represented as a tensor - """ - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - tokenizer = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - - with self.subTest("{} ({}, {})".format(tok_case.name, pretrained_name, tokenizer.__class__.__name__)): - - returned_tensor = "pt" if is_torch_available() else "tf" - - if not tokenizer.pad_token or tokenizer.pad_token_id < 0: - return - - tokens = tokenizer.encode_plus( - "HuggingFace is solving NLP one commit at a time", - max_length=6, - padding=True, - truncation=True, - return_tensors=returned_tensor, - return_overflowing_tokens=True, - ) - - for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): - self.assertEqual(len(tokens[key].shape), 2) - - # Mono sample - tokens = tokenizer.batch_encode_plus( - ["HuggingFace is solving NLP one commit at a time"], - max_length=6, - padding=True, - truncation="only_first", - return_tensors=returned_tensor, - return_overflowing_tokens=True, - ) - - for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): - self.assertEqual(len(tokens[key].shape), 2) - self.assertEqual(tokens[key].shape[-1], 6) - - # Multi sample - tokens = tokenizer.batch_encode_plus( - ["HuggingFace is solving NLP one commit at a time", "Very tiny input"], - max_length=6, - padding=True, - truncation="only_first", - return_tensors=returned_tensor, - return_overflowing_tokens=True, - ) - - for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): - self.assertEqual(len(tokens[key].shape), 2) - self.assertEqual(tokens[key].shape[-1], 6) - - def test_pretokenized_inputs(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - # Input string - pretokenized_input_simple = "This is a sample input".split() - pretokenized_input_pair = "This is a sample pair".split() - - # Test encode for pretokenized inputs - output_r = tokenizer_r.encode( - pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False - ) - output_p = tokenizer_p.encode( - pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False - ) - self.assertEqual(output_p, output_r) - - kwargs = { - "is_split_into_words": True, - # "return_token_type_ids": True, # Use the defaults for each tokenizers - # "return_attention_mask": True, # Use the defaults for each tokenizers - "return_overflowing_tokens": False, - "return_special_tokens_mask": True, - "return_offsets_mapping": False, # Not implemented in python tokenizers - # "add_special_tokens": False, - } - batch_kwargs = { - "is_split_into_words": True, - # "return_token_type_ids": True, # Use the defaults for each tokenizers - # "return_attention_mask": True, # Use the defaults for each tokenizers - "return_overflowing_tokens": False, - "return_special_tokens_mask": True, - "return_offsets_mapping": False, # Not implemented in python tokenizers - # "add_special_tokens": False, - } - # Test encode_plus for pretokenized inputs - output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs) - output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs) - for key in output_p.keys(): - self.assertEqual(output_p[key], output_r[key]) - - # Test batch_encode_plus for pretokenized inputs - 
input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair] - output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs) - output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs) - for key in output_p.keys(): - self.assertEqual(output_p[key], output_r[key]) - - # Test encode for pretokenized inputs pairs - output_r = tokenizer_r.encode( - pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True - ) - output_p = tokenizer_p.encode( - pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True - ) - self.assertEqual(output_p, output_r) - - # Test encode_plus for pretokenized inputs - output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs) - output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs) - for key in output_p.keys(): - self.assertEqual(output_p[key], output_r[key]) - - # Test batch_encode_plus for pretokenized inputs - input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [ - pretokenized_input_simple + pretokenized_input_pair, - pretokenized_input_pair, - ] - output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs) - output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs) - for key in output_p.keys(): - self.assertEqual(output_p[key], output_r[key]) - - def test_create_token_type_ids(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - input_simple = [1, 2, 3] - input_pair = [1, 2, 3] - - # Generate output - output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple) - output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple) - self.assertEqual(output_p, output_r) - - # Generate pair output - output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair) - output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair) - self.assertEqual(output_p, output_r) - - def test_build_inputs_with_special_tokens(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - # # Input string - # input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False) - # input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False) - - # # Generate output - # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple) - # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) - # self.assertEqual(output_p, output_r) - - # # Generate pair output - # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) - # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) - # self.assertEqual(output_p, output_r) - - # Input tokens id - input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False) - input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False) - - # Generate output - output_r = 
tokenizer_r.build_inputs_with_special_tokens(input_simple) - output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple) - self.assertEqual(output_p, output_r) - - # Generate pair output - output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair) - output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) - self.assertEqual(output_p, output_r) - - def test_padding(self, max_length=50): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - - def assert_padded_input_match(input_r: list, input_p: list, max_length: int): - - # Ensure we match max_length - self.assertEqual(len(input_r), max_length) - self.assertEqual(len(input_p), max_length) - - # Ensure the number of padded tokens is the same - padded_tokens_r = list(takewhile(lambda i: i == tokenizer_r.pad_token_id, reversed(input_r))) - padded_tokens_p = list(takewhile(lambda i: i == tokenizer_p.pad_token_id, reversed(input_p))) - self.assertSequenceEqual(padded_tokens_r, padded_tokens_p) - - def assert_batch_padded_input_match(input_r: dict, input_p: dict, max_length: int): - for i_r in input_r.values(): - self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual( - len(i_r[1]), max_length - ) - self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual( - len(i_r[1]), max_length - ) - - for i_r, i_p in zip(input_r["input_ids"], input_p["input_ids"]): - assert_padded_input_match(i_r, i_p, max_length) - - for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]): - self.assertSequenceEqual(i_r, i_p) - - # Encode - Simple input - input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) - input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) - assert_padded_input_match(input_r, input_p, max_length) - input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length") - input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length") - assert_padded_input_match(input_r, input_p, max_length) - - input_r = tokenizer_r.encode("This is a simple input", padding="longest") - input_p = tokenizer_p.encode("This is a simple input", padding=True) - assert_padded_input_match(input_r, input_p, len(input_r)) - - # Encode - Pair input - input_r = tokenizer_r.encode( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - assert_padded_input_match(input_r, input_p, max_length) - input_r = tokenizer_r.encode( - "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" - ) - input_p = tokenizer_p.encode( - "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" - ) - assert_padded_input_match(input_r, input_p, max_length) - input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True) - input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest") - assert_padded_input_match(input_r, input_p, len(input_r)) - - # Encode_plus - Simple input - input_r = 
tokenizer_r.encode_plus( - "This is a simple input", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", max_length=max_length, pad_to_max_length=True - ) - assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - input_r = tokenizer_r.encode_plus( - "This is a simple input", max_length=max_length, padding="max_length" - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", max_length=max_length, padding="max_length" - ) - assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - - input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest") - input_p = tokenizer_p.encode_plus("This is a simple input", padding=True) - assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"])) - - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - - # Encode_plus - Pair input - input_r = tokenizer_r.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True - ) - assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - input_r = tokenizer_r.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", "This is a pair", max_length=max_length, padding="max_length" - ) - assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest") - input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True) - assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"])) - self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - - # Batch_encode_plus - Simple input - input_r = tokenizer_r.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - pad_to_max_length=True, - ) - input_p = tokenizer_p.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - pad_to_max_length=True, - ) - assert_batch_padded_input_match(input_r, input_p, max_length) - - input_r = tokenizer_r.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - padding="max_length", - ) - input_p = tokenizer_p.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - padding="max_length", - ) - assert_batch_padded_input_match(input_r, input_p, max_length) - - input_r = tokenizer_r.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - padding="longest", - ) - input_p = tokenizer_p.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], - max_length=max_length, - padding=True, - ) - assert_batch_padded_input_match(input_r, input_p, 
len(input_r["input_ids"][0])) - - input_r = tokenizer_r.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], padding="longest" - ) - input_p = tokenizer_p.batch_encode_plus( - ["This is a simple input 1", "This is a simple input 2"], padding=True - ) - assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0])) - - # Batch_encode_plus - Pair input - input_r = tokenizer_r.batch_encode_plus( - [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), - ], - max_length=max_length, - truncation=True, - padding="max_length", - ) - input_p = tokenizer_p.batch_encode_plus( - [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), - ], - max_length=max_length, - truncation=True, - padding="max_length", - ) - assert_batch_padded_input_match(input_r, input_p, max_length) - - input_r = tokenizer_r.batch_encode_plus( - [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), - ], - padding=True, - ) - input_p = tokenizer_p.batch_encode_plus( - [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), - ], - padding="longest", - ) - assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0])) - - # Using pad on single examples after tokenization - input_r = tokenizer_r.encode_plus("This is a input 1") - input_r = tokenizer_r.pad(input_r) - - input_p = tokenizer_r.encode_plus("This is a input 1") - input_p = tokenizer_r.pad(input_p) - - assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"])) - - # Using pad on single examples after tokenization - input_r = tokenizer_r.encode_plus("This is a input 1") - input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") - - input_p = tokenizer_r.encode_plus("This is a input 1") - input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") - - assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) - - # Using pad after tokenization - input_r = tokenizer_r.batch_encode_plus( - ["This is a input 1", "This is a much longer input whilch should be padded"] - ) - input_r = tokenizer_r.pad(input_r) - - input_p = tokenizer_r.batch_encode_plus( - ["This is a input 1", "This is a much longer input whilch should be padded"] - ) - input_p = tokenizer_r.pad(input_p) - - assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0])) - - # Using pad after tokenization - input_r = tokenizer_r.batch_encode_plus( - ["This is a input 1", "This is a much longer input whilch should be padded"] - ) - input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length") - - input_p = tokenizer_r.batch_encode_plus( - ["This is a input 1", "This is a much longer input whilch should be padded"] - ) - input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length") - - assert_batch_padded_input_match(input_r, input_p, max_length) - - def test_save_pretrained(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - # Checks it save with the same files - self.assertSequenceEqual( - 
tokenizer_r.save_vocabulary(self.tmpdirname), tokenizer_p.save_vocabulary(self.tmpdirname) - ) - - # Checks everything loads correctly in the same way - tokenizer_rp, tokenizer_pp = tokenizer_r.from_pretrained(self.tmpdirname), tokenizer_p.from_pretrained( - self.tmpdirname - ) - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key)) - # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id")) - - def test_embeded_special_tokens(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - sentence = "A, AllenNLP sentence." - tokens_r = tokenizer_r.encode_plus( - sentence, - add_special_tokens=True, - ) - tokens_p = tokenizer_p.encode_plus( - sentence, - add_special_tokens=True, - ) - - for key in tokens_p.keys(): - self.assertEqual(tokens_r[key], tokens_p[key]) - - if "token_type_ids" in tokens_r: - self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) - - tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) - tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) - self.assertSequenceEqual(tokens_r, tokens_p) - - def test_add_special_tokens(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - - simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) - # pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True) - - for text in ["", " "]: - # tokenize() - no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False) - with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True) - self.assertEqual( - len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add - ) - - # encode() - no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False) - with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True) - self.assertEqual( - len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add - ) - - # encode_plus() - no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False) - with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True) - for key in no_special_tokens.keys(): - self.assertEqual( - len(no_special_tokens[key]), - len(with_special_tokens[key]) - simple_num_special_tokens_to_add, - ) - - # # batch_encode_plus - no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False) - with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True) - for key in no_special_tokens.keys(): - for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]): - self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add) - - def test_prepare_for_model(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = 
tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - string_sequence = "Asserting that both tokenizers are equal" - python_output = tokenizer_p.prepare_for_model( - tokenizer_p.encode(string_sequence, add_special_tokens=False) - ) - rust_output = tokenizer_r.prepare_for_model( - tokenizer_r.encode(string_sequence, add_special_tokens=False) - ) - for key in python_output: - self.assertEqual(python_output[key], rust_output[key]) - - -class WordPieceFastTokenizerTest(CommonFastTokenizerTest): - """ - Override all the specific methods to test WordPiece behavior - """ - - TOKENIZERS_CLASSES = frozenset( - [ - Tokenizer("Bert", BertTokenizerFast, BertTokenizer, "vocab_file", filter_non_english, None), - Tokenizer( - "DistilBert", DistilBertTokenizerFast, DistilBertTokenizer, "vocab_file", filter_non_english, None - ), - Tokenizer( - "DPRReaderTokenizer", - DPRReaderTokenizerFast, - DPRReaderTokenizer, - "vocab_file", - filter_non_english, - None, - ), - Tokenizer( - "DPRQuestionEncoderTokenizer", - DPRQuestionEncoderTokenizerFast, - DPRQuestionEncoderTokenizer, - "vocab_file", - filter_non_english, - None, - ), - Tokenizer( - "DPRContextEncoderTokenizer", - DPRContextEncoderTokenizerFast, - DPRContextEncoderTokenizer, - "vocab_file", - filter_non_english, - None, - ), - Tokenizer("FunnelTokenizer", FunnelTokenizerFast, FunnelTokenizer, "vocab_file", filter_non_english, None), - Tokenizer("LxmertTokenizer", LxmertTokenizerFast, LxmertTokenizer, "vocab_file", filter_non_english, None), - ] - ) - - def test_offsets_with_special_characters(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - - sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
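As an aside to the offset checks in the deleted WordPiece test around this point: only the Rust-backed "fast" tokenizers can return character offsets through return_offsets_mapping (the comments in the deleted code note it is not implemented for the Python tokenizers). The standalone sketch below illustrates that behaviour outside the test suite; the checkpoint name bert-base-uncased is just an example choice, any WordPiece fast tokenizer would do.

from transformers import BertTokenizerFast

# Offsets are only available from the Rust-backed ("fast") tokenizers.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
encoded = tokenizer("A, naïve [MASK] AllenNLP sentence.", return_offsets_mapping=True)

tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
for token, (start, end) in zip(tokens, encoded["offset_mapping"]):
    # Special tokens such as [CLS] and [SEP] map to the empty span (0, 0).
    print(f"{token:>10} -> ({start}, {end})")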
- tokens = tokenizer_r.encode_plus( - sentence, - return_attention_mask=False, - return_token_type_ids=False, - return_offsets_mapping=True, - add_special_tokens=True, - ) - - do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False - expected_results = ( - [ - ((0, 0), tokenizer_r.cls_token), - ((0, 1), "A"), - ((1, 2), ","), - ((3, 5), "na"), - ((5, 6), "##ï"), - ((6, 8), "##ve"), - ((9, 15), tokenizer_r.mask_token), - ((16, 21), "Allen"), - ((21, 23), "##NL"), - ((23, 24), "##P"), - ((25, 33), "sentence"), - ((33, 34), "."), - ((0, 0), tokenizer_r.sep_token), - ] - if not do_lower_case - else [ - ((0, 0), tokenizer_r.cls_token), - ((0, 1), "a"), - ((1, 2), ","), - ((3, 8), "naive"), - ((9, 15), tokenizer_r.mask_token), - ((16, 21), "allen"), - ((21, 23), "##nl"), - ((23, 24), "##p"), - ((25, 33), "sentence"), - ((33, 34), "."), - ((0, 0), tokenizer_r.sep_token), - ] - ) - - self.assertEqual( - [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) - ) - self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) - - -class RobertaFastTokenizerTest(CommonFastTokenizerTest): - TOKENIZERS_CLASSES = frozenset( - [ - Tokenizer( - "Roberta", - RobertaTokenizerFast, - RobertaTokenizer, - "vocab_file", - filter_roberta_detectors, - (("cls_token", ""),), - ), - Tokenizer( - "Bart", - BartTokenizerFast, - BartTokenizer, - "vocab_file", - None, - None, - ), - ] - ) - - def test_pretokenized_inputs(self): - pass - - def test_embeded_special_tokens(self): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) - sentence = "A, AllenNLP sentence." 
- tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) - tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) - - # token_type_ids should put 0 everywhere - self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) - - # attention_mask should put 1 everywhere, so sum over length should be 1 - self.assertEqual( - sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]), - sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]), - ) - - tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) - tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) - - # Rust correctly handles the space before the mask while python doesnt - self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) - self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) - - self.assertSequenceEqual( - tokens_p_str, ["", "A", ",", "", "Ä Allen", "N", "LP", "Ä sentence", ".", ""] - ) - self.assertSequenceEqual( - tokens_r_str, ["", "A", ",", "", "Ä Allen", "N", "LP", "Ä sentence", ".", ""] - ) - - -class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest): - TOKENIZERS_CLASSES = [ - Tokenizer("OpenAI GPT", OpenAIGPTTokenizerFast, OpenAIGPTTokenizer, "vocab_file", None, None), - Tokenizer("GPT2", GPT2TokenizerFast, GPT2Tokenizer, "vocab_file", None, [("add_prefix_space", True)]), - ] - - def test_pretokenized_inputs(self): - pass - - def test_padding(self, max_length=15): - for tok_case, pretrained_name, kwargs in self.tokenizers_list: - with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): - tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) - - # Simple input - s = "This is a simple input" - s2 = ["This is a simple input 1", "This is a simple input 2"] - p = ("This is a simple input", "This is a pair") - p2 = [ - ("This is a simple input 1", "This is a simple input 2"), - ("This is a simple pair 1", "This is a simple pair 2"), - ] - - # Simple input tests - self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") - - # Simple input - self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") - - # Simple input - self.assertRaises( - ValueError, - tokenizer_r.batch_encode_plus, - s2, - max_length=max_length, - padding="max_length", - ) - - # Pair input - self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") - - # Pair input - self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") - - # Pair input - self.assertRaises( - ValueError, - tokenizer_r.batch_encode_plus, - p2, - max_length=max_length, - padding="max_length", - ) - - -class SentencePieceFastTokenizerTest(CommonFastTokenizerTest): - """ - Override specific methods to test SentencePiece behavior - """ - - TOKENIZERS_CLASSES = frozenset( - [ - Tokenizer("Albert", AlbertTokenizerFast, AlbertTokenizer, "vocab_file", None, None), - Tokenizer("Camembert", CamembertTokenizerFast, CamembertTokenizer, "vocab_file", None, None), - Tokenizer("T5", T5TokenizerFast, T5Tokenizer, "vocab_file", None, None), - Tokenizer( - "MBart", - MBartTokenizerFast, - MBartTokenizer, - "vocab_file", - None, - None, - ), - Tokenizer("Pegasus", PegasusTokenizerFast, PegasusTokenizer, "vocab_file", None, None), - 
Tokenizer("Reformer", ReformerTokenizerFast, ReformerTokenizer, "vocab_file", None, None), - Tokenizer("XLMRoberta", XLMRobertaTokenizerFast, XLMRobertaTokenizer, "vocab_file", None, None), - Tokenizer("XLNet", XLNetTokenizerFast, XLNetTokenizer, "vocab_file", None, None), - ] - ) diff --git a/tests/test_tokenization_funnel.py b/tests/test_tokenization_funnel.py index 11945ffc520..b2c9d6fc2e5 100644 --- a/tests/test_tokenization_funnel.py +++ b/tests/test_tokenization_funnel.py @@ -17,14 +17,18 @@ import os import unittest -from transformers.tokenization_funnel import VOCAB_FILES_NAMES, FunnelTokenizer, FunnelTokenizerFast +from transformers import FunnelTokenizer, FunnelTokenizerFast +from transformers.testing_utils import require_tokenizers +from transformers.tokenization_funnel import VOCAB_FILES_NAMES from .test_tokenization_common import TokenizerTesterMixin +@require_tokenizers class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = FunnelTokenizer + rust_tokenizer_class = FunnelTokenizerFast test_rust_tokenizer = True space_between_special_tokens = True diff --git a/tests/test_tokenization_gpt2.py b/tests/test_tokenization_gpt2.py index 29420d0b03d..cb479f2e34f 100644 --- a/tests/test_tokenization_gpt2.py +++ b/tests/test_tokenization_gpt2.py @@ -18,16 +18,20 @@ import json import os import unittest -from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer, GPT2TokenizerFast +from transformers import GPT2Tokenizer, GPT2TokenizerFast +from transformers.testing_utils import require_tokenizers +from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES from .test_tokenization_common import TokenizerTesterMixin +@require_tokenizers class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = GPT2Tokenizer rust_tokenizer_class = GPT2TokenizerFast test_rust_tokenizer = True + from_pretrained_kwargs = {"add_prefix_space": True} def setUp(self): super().setUp() @@ -125,3 +129,47 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # It's very difficult to mix/test pretokenization with byte-level # And get both GPT2 and Roberta to work at the same time (mostly an issue of adding a space before the string) pass + + def test_padding(self, max_length=15): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] + + # Simple input tests + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + s2, + max_length=max_length, + padding="max_length", + ) + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + p2, + 
max_length=max_length, + padding="max_length", + ) diff --git a/tests/test_tokenization_herbert.py b/tests/test_tokenization_herbert.py index bf876a05ec3..7565241af7a 100644 --- a/tests/test_tokenization_herbert.py +++ b/tests/test_tokenization_herbert.py @@ -18,12 +18,14 @@ import json import os import unittest -from transformers.testing_utils import slow -from transformers.tokenization_herbert import VOCAB_FILES_NAMES, HerbertTokenizer, HerbertTokenizerFast +from transformers import HerbertTokenizer, HerbertTokenizerFast +from transformers.testing_utils import get_tests_dir, require_tokenizers, slow +from transformers.tokenization_herbert import VOCAB_FILES_NAMES from .test_tokenization_common import TokenizerTesterMixin +@require_tokenizers class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = HerbertTokenizer @@ -33,6 +35,10 @@ class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def setUp(self): super().setUp() + # Use a simpler test file without japanese/chinese characters + with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data: + self._data = f_data.read().replace("\n\n", "\n").strip() + vocab = [ "", "", diff --git a/tests/test_tokenization_layoutlm.py b/tests/test_tokenization_layoutlm.py index 7361908fed1..654d857ceb9 100644 --- a/tests/test_tokenization_layoutlm.py +++ b/tests/test_tokenization_layoutlm.py @@ -17,14 +17,20 @@ import os import unittest -from transformers.tokenization_layoutlm import VOCAB_FILES_NAMES, LayoutLMTokenizer +from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast +from transformers.testing_utils import require_tokenizers +from transformers.tokenization_layoutlm import VOCAB_FILES_NAMES from .test_tokenization_common import TokenizerTesterMixin +@require_tokenizers class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = LayoutLMTokenizer + rust_tokenizer_class = LayoutLMTokenizerFast + test_rust_tokenizer = True + space_between_special_tokens = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_lxmert.py b/tests/test_tokenization_lxmert.py index 953bca48321..a4677bcb5fe 100644 --- a/tests/test_tokenization_lxmert.py +++ b/tests/test_tokenization_lxmert.py @@ -17,12 +17,14 @@ import os import unittest +from transformers import LxmertTokenizer, LxmertTokenizerFast +from transformers.testing_utils import require_tokenizers from transformers.tokenization_bert import VOCAB_FILES_NAMES -from transformers.tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast from .test_tokenization_common import TokenizerTesterMixin +@require_tokenizers class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = LxmertTokenizer diff --git a/tests/test_tokenization_marian.py b/tests/test_tokenization_marian.py index 23bf4bd5190..5759c91b9e1 100644 --- a/tests/test_tokenization_marian.py +++ b/tests/test_tokenization_marian.py @@ -20,9 +20,12 @@ import unittest from pathlib import Path from shutil import copyfile -from transformers.testing_utils import _torch_available -from transformers.tokenization_marian import MarianTokenizer, save_json, vocab_files_names -from transformers.tokenization_utils import BatchEncoding +from transformers import BatchEncoding, MarianTokenizer +from transformers.testing_utils import _sentencepiece_available, _torch_available, require_sentencepiece + + +if _sentencepiece_available: + from transformers.tokenization_marian import save_json, 
vocab_files_names from .test_tokenization_common import TokenizerTesterMixin @@ -35,6 +38,7 @@ ORG_NAME = "Helsinki-NLP/" FRAMEWORK = "pt" if _torch_available else "tf" +@require_sentencepiece class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = MarianTokenizer diff --git a/tests/test_tokenization_mbart.py b/tests/test_tokenization_mbart.py index 71b84077d81..171d08880f5 100644 --- a/tests/test_tokenization_mbart.py +++ b/tests/test_tokenization_mbart.py @@ -1,11 +1,26 @@ import tempfile import unittest -from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer, MBartTokenizerFast, is_torch_available -from transformers.testing_utils import require_torch +from transformers import ( + SPIECE_UNDERLINE, + AutoTokenizer, + BatchEncoding, + MBartTokenizer, + MBartTokenizerFast, + is_torch_available, +) +from transformers.testing_utils import ( + _sentencepiece_available, + require_sentencepiece, + require_tokenizers, + require_torch, +) from .test_tokenization_common import TokenizerTesterMixin -from .test_tokenization_xlm_roberta import SAMPLE_VOCAB, SPIECE_UNDERLINE + + +if _sentencepiece_available: + from .test_tokenization_xlm_roberta import SAMPLE_VOCAB if is_torch_available(): @@ -15,6 +30,8 @@ EN_CODE = 250004 RO_CODE = 250020 +@require_sentencepiece +@require_tokenizers class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = MBartTokenizer rust_tokenizer_class = MBartTokenizerFast @@ -105,6 +122,8 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @require_torch +@require_sentencepiece +@require_tokenizers class MBartEnroIntegrationTest(unittest.TestCase): checkpoint_name = "facebook/mbart-large-en-ro" src_text = [ diff --git a/tests/test_tokenization_openai.py b/tests/test_tokenization_openai.py index 88f253d0abd..97f674a428a 100644 --- a/tests/test_tokenization_openai.py +++ b/tests/test_tokenization_openai.py @@ -18,11 +18,14 @@ import json import os import unittest -from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer, OpenAIGPTTokenizerFast +from transformers import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast +from transformers.testing_utils import require_tokenizers +from transformers.tokenization_openai import VOCAB_FILES_NAMES from .test_tokenization_common import TokenizerTesterMixin +@require_tokenizers class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = OpenAIGPTTokenizer @@ -80,3 +83,47 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): input_tokens = tokens + [""] input_bpe_tokens = [14, 15, 20] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + + def test_padding(self, max_length=15): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] + + # Simple input tests + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length") + + # Simple input + self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, 
padding="max_length") + + # Simple input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + s2, + max_length=max_length, + padding="max_length", + ) + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length") + + # Pair input + self.assertRaises( + ValueError, + tokenizer_r.batch_encode_plus, + p2, + max_length=max_length, + padding="max_length", + ) diff --git a/tests/test_tokenization_pegasus.py b/tests/test_tokenization_pegasus.py index ae186ac1f6b..6536220c32c 100644 --- a/tests/test_tokenization_pegasus.py +++ b/tests/test_tokenization_pegasus.py @@ -1,8 +1,8 @@ import unittest +from transformers import PegasusTokenizer, PegasusTokenizerFast from transformers.file_utils import cached_property -from transformers.testing_utils import get_tests_dir, require_torch -from transformers.tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast +from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch from .test_tokenization_common import TokenizerTesterMixin @@ -10,6 +10,8 @@ from .test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model") +@require_sentencepiece +@require_tokenizers class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = PegasusTokenizer diff --git a/tests/test_tokenization_reformer.py b/tests/test_tokenization_reformer.py index f134958e813..cdad76350a3 100644 --- a/tests/test_tokenization_reformer.py +++ b/tests/test_tokenization_reformer.py @@ -17,9 +17,9 @@ import os import unittest +from transformers import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast from transformers.file_utils import cached_property -from transformers.testing_utils import require_torch, slow -from transformers.tokenization_reformer import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow from .test_tokenization_common import TokenizerTesterMixin @@ -27,6 +27,8 @@ from .test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") +@require_sentencepiece +@require_tokenizers class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = ReformerTokenizer diff --git a/tests/test_tokenization_roberta.py b/tests/test_tokenization_roberta.py index e96fa58fb91..30d5c41782d 100644 --- a/tests/test_tokenization_roberta.py +++ b/tests/test_tokenization_roberta.py @@ -18,16 +18,19 @@ import json import os import unittest -from transformers.testing_utils import slow -from transformers.tokenization_roberta import VOCAB_FILES_NAMES, AddedToken, RobertaTokenizer, RobertaTokenizerFast +from transformers import AddedToken, RobertaTokenizer, RobertaTokenizerFast +from transformers.testing_utils import require_tokenizers, slow +from transformers.tokenization_roberta import VOCAB_FILES_NAMES from .test_tokenization_common import TokenizerTesterMixin +@require_tokenizers class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = RobertaTokenizer rust_tokenizer_class = RobertaTokenizerFast test_rust_tokenizer = True + from_pretrained_kwargs = {"cls_token": ""} def setUp(self): 
super().setUp() @@ -158,3 +161,38 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): mask_loc = encoded.index(mask_ind) first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0] self.assertNotEqual(first_char, space_encoding) + + def test_pretokenized_inputs(self): + pass + + def test_embeded_special_tokens(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) + sentence = "A, AllenNLP sentence." + tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) + tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) + + # token_type_ids should put 0 everywhere + self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) + + # attention_mask should put 1 everywhere, so sum over length should be 1 + self.assertEqual( + sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]), + sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]), + ) + + tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) + tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) + + # Rust correctly handles the space before the mask while python doesnt + self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) + self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) + + self.assertSequenceEqual( + tokens_p_str, ["", "A", ",", "", "Ä Allen", "N", "LP", "Ä sentence", ".", ""] + ) + self.assertSequenceEqual( + tokens_r_str, ["", "A", ",", "", "Ä Allen", "N", "LP", "Ä sentence", ".", ""] + ) diff --git a/tests/test_tokenization_squeezebert.py b/tests/test_tokenization_squeezebert.py index f8a8709feeb..3637717a0c7 100644 --- a/tests/test_tokenization_squeezebert.py +++ b/tests/test_tokenization_squeezebert.py @@ -14,15 +14,18 @@ # limitations under the License. 
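Most of the test modules touched by this patch gain @require_tokenizers and/or @require_sentencepiece class decorators imported from transformers.testing_utils, so an entire test class is skipped when the optional backend is not installed. The snippet below is only a rough sketch of how such a guard can be built on top of unittest.skipUnless; the real helpers in transformers.testing_utils are not reproduced here and may be implemented differently.

import importlib.util
import unittest

def _backend_available(package: str) -> bool:
    # True when the optional dependency can be imported in this environment.
    return importlib.util.find_spec(package) is not None

def require_tokenizers_sketch(test_case):
    # Skip the decorated test function or test class when `tokenizers` is missing.
    return unittest.skipUnless(_backend_available("tokenizers"), "test requires tokenizers")(test_case)

@require_tokenizers_sketch
class ExampleTokenizationTest(unittest.TestCase):
    def test_placeholder(self):
        self.assertTrue(True)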
-from transformers.testing_utils import slow -from transformers.tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast +from transformers import SqueezeBertTokenizer, SqueezeBertTokenizerFast +from transformers.testing_utils import require_tokenizers, slow from .test_tokenization_bert import BertTokenizationTest +@require_tokenizers class SqueezeBertTokenizationTest(BertTokenizationTest): tokenizer_class = SqueezeBertTokenizer + rust_tokenizer_class = SqueezeBertTokenizerFast + test_rust_tokenizer = True def get_rust_tokenizer(self, **kwargs): return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) diff --git a/tests/test_tokenization_t5.py b/tests/test_tokenization_t5.py index 9b670478cfc..5be6fdfdffd 100644 --- a/tests/test_tokenization_t5.py +++ b/tests/test_tokenization_t5.py @@ -16,11 +16,9 @@ import unittest -from transformers import BatchEncoding +from transformers import SPIECE_UNDERLINE, BatchEncoding, T5Tokenizer, T5TokenizerFast from transformers.file_utils import cached_property -from transformers.testing_utils import _torch_available, get_tests_dir -from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast -from transformers.tokenization_xlnet import SPIECE_UNDERLINE +from transformers.testing_utils import _torch_available, get_tests_dir, require_sentencepiece, require_tokenizers from .test_tokenization_common import TokenizerTesterMixin @@ -30,6 +28,8 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") FRAMEWORK = "pt" if _torch_available else "tf" +@require_sentencepiece +@require_tokenizers class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = T5Tokenizer diff --git a/tests/test_tokenization_utils.py b/tests/test_tokenization_utils.py index 564d8798767..d2b1e69f006 100644 --- a/tests/test_tokenization_utils.py +++ b/tests/test_tokenization_utils.py @@ -19,7 +19,7 @@ from typing import Callable, Optional import numpy as np from transformers import BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, TensorType -from transformers.testing_utils import require_tf, require_torch, slow +from transformers.testing_utils import require_tf, require_tokenizers, require_torch, slow from transformers.tokenization_gpt2 import GPT2Tokenizer @@ -68,6 +68,7 @@ class TokenizerUtilsTest(unittest.TestCase): self.assertEqual(TensorType("pt"), TensorType.PYTORCH) self.assertEqual(TensorType("np"), TensorType.NUMPY) + @require_tokenizers def test_batch_encoding_pickle(self): import numpy as np @@ -92,6 +93,7 @@ class TokenizerUtilsTest(unittest.TestCase): ) @require_tf + @require_tokenizers def test_batch_encoding_pickle_tf(self): import tensorflow as tf @@ -112,6 +114,7 @@ class TokenizerUtilsTest(unittest.TestCase): ) @require_torch + @require_tokenizers def test_batch_encoding_pickle_pt(self): import torch @@ -128,6 +131,7 @@ class TokenizerUtilsTest(unittest.TestCase): tokenizer_r("Small example to encode", return_tensors=TensorType.PYTORCH), torch.equal ) + @require_tokenizers def test_batch_encoding_is_fast(self): tokenizer_p = BertTokenizer.from_pretrained("bert-base-cased") tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-cased") diff --git a/tests/test_tokenization_xlm_roberta.py b/tests/test_tokenization_xlm_roberta.py index 1b64e0091e4..39c985b7a96 100644 --- a/tests/test_tokenization_xlm_roberta.py +++ b/tests/test_tokenization_xlm_roberta.py @@ -17,9 +17,9 @@ import os import unittest +from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, 
XLMRobertaTokenizerFast from transformers.file_utils import cached_property -from transformers.testing_utils import slow -from transformers.tokenization_xlm_roberta import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast +from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow from .test_tokenization_common import TokenizerTesterMixin @@ -27,6 +27,8 @@ from .test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") +@require_sentencepiece +@require_tokenizers class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XLMRobertaTokenizer diff --git a/tests/test_tokenization_xlnet.py b/tests/test_tokenization_xlnet.py index d0ee0da26f8..550ef559628 100644 --- a/tests/test_tokenization_xlnet.py +++ b/tests/test_tokenization_xlnet.py @@ -17,8 +17,8 @@ import os import unittest -from transformers.testing_utils import slow -from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast +from transformers import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast +from transformers.testing_utils import require_sentencepiece, require_tokenizers, slow from .test_tokenization_common import TokenizerTesterMixin @@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model") +@require_sentencepiece +@require_tokenizers class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XLNetTokenizer diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 2c3b7f4a02d..019f29a35b4 100755 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -23,7 +23,7 @@ import numpy as np from transformers import AutoTokenizer, PretrainedConfig, TrainingArguments, is_torch_available from transformers.file_utils import WEIGHTS_NAME -from transformers.testing_utils import get_tests_dir, require_torch, slow +from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow if is_torch_available(): @@ -151,6 +151,8 @@ if is_torch_available(): @require_torch +@require_sentencepiece +@require_tokenizers class TrainerIntegrationTest(unittest.TestCase): def setUp(self): args = TrainingArguments(".") diff --git a/utils/check_dummies.py b/utils/check_dummies.py index 7a18d5e0dbb..ad1de4fa6ae 100644 --- a/utils/check_dummies.py +++ b/utils/check_dummies.py @@ -49,6 +49,7 @@ def {0}(*args, **kwargs): requires_pytorch({0}) """ + DUMMY_TF_PRETRAINED_CLASS = """ class {0}: def __init__(self, *args, **kwargs): @@ -71,12 +72,111 @@ def {0}(*args, **kwargs): """ +DUMMY_SENTENCEPIECE_PRETRAINED_CLASS = """ +class {0}: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_sentencepiece(self) +""" + +DUMMY_SENTENCEPIECE_CLASS = """ +class {0}: + def __init__(self, *args, **kwargs): + requires_sentencepiece(self) +""" + +DUMMY_SENTENCEPIECE_FUNCTION = """ +def {0}(*args, **kwargs): + requires_sentencepiece({0}) +""" + + +DUMMY_TOKENIZERS_PRETRAINED_CLASS = """ +class {0}: + def __init__(self, *args, **kwargs): + requires_tokenizers(self) + + @classmethod + def from_pretrained(self, *args, **kwargs): + requires_tokenizers(self) +""" + +DUMMY_TOKENIZERS_CLASS = """ +class {0}: + def __init__(self, *args, **kwargs): + 
requires_tokenizers(self) +""" + +DUMMY_TOKENIZERS_FUNCTION = """ +def {0}(*args, **kwargs): + requires_tokenizers({0}) +""" + +# Map all these to dummy type + +DUMMY_PRETRAINED_CLASS = { + "pt": DUMMY_PT_PRETRAINED_CLASS, + "tf": DUMMY_TF_PRETRAINED_CLASS, + "sentencepiece": DUMMY_SENTENCEPIECE_PRETRAINED_CLASS, + "tokenizers": DUMMY_TOKENIZERS_PRETRAINED_CLASS, +} + +DUMMY_CLASS = { + "pt": DUMMY_PT_CLASS, + "tf": DUMMY_TF_CLASS, + "sentencepiece": DUMMY_SENTENCEPIECE_CLASS, + "tokenizers": DUMMY_TOKENIZERS_CLASS, +} + +DUMMY_FUNCTION = { + "pt": DUMMY_PT_FUNCTION, + "tf": DUMMY_TF_FUNCTION, + "sentencepiece": DUMMY_SENTENCEPIECE_FUNCTION, + "tokenizers": DUMMY_TOKENIZERS_FUNCTION, +} + + def read_init(): - """ Read the init and exctracts PyTorch and TensorFlow objects. """ + """ Read the init and exctracts PyTorch, TensorFlow, SentencePiece and Tokenizers objects. """ with open(os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"), "r", encoding="utf-8") as f: lines = f.readlines() line_index = 0 + # Find where the SentencePiece imports begin + sentencepiece_objects = [] + while not lines[line_index].startswith("if is_sentencepiece_available():"): + line_index += 1 + line_index += 1 + + # Until we unindent, add SentencePiece objects to the list + while len(lines[line_index]) <= 1 or lines[line_index].startswith(" "): + line = lines[line_index] + search = _re_single_line_import.search(line) + if search is not None: + sentencepiece_objects += search.groups()[0].split(", ") + elif line.startswith(" "): + sentencepiece_objects.append(line[8:-2]) + line_index += 1 + + # Find where the Tokenizers imports begin + tokenizers_objects = [] + while not lines[line_index].startswith("if is_tokenizers_available():"): + line_index += 1 + line_index += 1 + + # Until we unindent, add Tokenizers objects to the list + while len(lines[line_index]) <= 1 or lines[line_index].startswith(" "): + line = lines[line_index] + search = _re_single_line_import.search(line) + if search is not None: + tokenizers_objects += search.groups()[0].split(", ") + elif line.startswith(" "): + tokenizers_objects.append(line[8:-2]) + line_index += 1 + # Find where the PyTorch imports begin pt_objects = [] while not lines[line_index].startswith("if is_torch_available():"): @@ -108,10 +208,10 @@ def read_init(): elif line.startswith(" "): tf_objects.append(line[8:-2]) line_index += 1 - return pt_objects, tf_objects + return sentencepiece_objects, tokenizers_objects, pt_objects, tf_objects -def create_dummy_object(name, is_pytorch=True): +def create_dummy_object(name, type="pt"): """ Create the code for the dummy object corresponding to `name`.""" _pretrained = [ "Config" "ForCausalLM", @@ -124,10 +224,11 @@ def create_dummy_object(name, is_pytorch=True): "Model", "Tokenizer", ] + assert type in ["pt", "tf", "sentencepiece", "tokenizers"] if name.isupper(): return DUMMY_CONSTANT.format(name) elif name.islower(): - return (DUMMY_PT_FUNCTION if is_pytorch else DUMMY_TF_FUNCTION).format(name) + return (DUMMY_FUNCTION[type]).format(name) else: is_pretrained = False for part in _pretrained: @@ -135,39 +236,75 @@ def create_dummy_object(name, is_pytorch=True): is_pretrained = True break if is_pretrained: - template = DUMMY_PT_PRETRAINED_CLASS if is_pytorch else DUMMY_TF_PRETRAINED_CLASS + template = DUMMY_PRETRAINED_CLASS[type] else: - template = DUMMY_PT_CLASS if is_pytorch else DUMMY_TF_CLASS + template = DUMMY_CLASS[type] return template.format(name) def create_dummy_files(): """ Create the content of the dummy files. 
""" - pt_objects, tf_objects = read_init() + sentencepiece_objects, tokenizers_objects, pt_objects, tf_objects = read_init() + + sentencepiece_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n" + sentencepiece_dummies += "from ..file_utils import requires_sentencepiece\n\n" + sentencepiece_dummies += "\n".join([create_dummy_object(o, type="sentencepiece") for o in sentencepiece_objects]) + + tokenizers_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n" + tokenizers_dummies += "from ..file_utils import requires_tokenizers\n\n" + tokenizers_dummies += "\n".join([create_dummy_object(o, type="tokenizers") for o in tokenizers_objects]) pt_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n" pt_dummies += "from ..file_utils import requires_pytorch\n\n" - pt_dummies += "\n".join([create_dummy_object(o) for o in pt_objects]) + pt_dummies += "\n".join([create_dummy_object(o, type="pt") for o in pt_objects]) tf_dummies = "# This file is autogenerated by the command `make fix-copies`, do not edit.\n" tf_dummies += "from ..file_utils import requires_tf\n\n" - tf_dummies += "\n".join([create_dummy_object(o, False) for o in tf_objects]) + tf_dummies += "\n".join([create_dummy_object(o, type="tf") for o in tf_objects]) - return pt_dummies, tf_dummies + return sentencepiece_dummies, tokenizers_dummies, pt_dummies, tf_dummies def check_dummies(overwrite=False): """ Check if the dummy files are up to date and maybe `overwrite` with the right content. """ - pt_dummies, tf_dummies = create_dummy_files() + sentencepiece_dummies, tokenizers_dummies, pt_dummies, tf_dummies = create_dummy_files() path = os.path.join(PATH_TO_TRANSFORMERS, "utils") + sentencepiece_file = os.path.join(path, "dummy_sentencepiece_objects.py") + tokenizers_file = os.path.join(path, "dummy_tokenizers_objects.py") pt_file = os.path.join(path, "dummy_pt_objects.py") tf_file = os.path.join(path, "dummy_tf_objects.py") + with open(sentencepiece_file, "r", encoding="utf-8") as f: + actual_sentencepiece_dummies = f.read() + with open(tokenizers_file, "r", encoding="utf-8") as f: + actual_tokenizers_dummies = f.read() with open(pt_file, "r", encoding="utf-8") as f: actual_pt_dummies = f.read() with open(tf_file, "r", encoding="utf-8") as f: actual_tf_dummies = f.read() + if sentencepiece_dummies != actual_sentencepiece_dummies: + if overwrite: + print("Updating transformers.utils.dummy_sentencepiece_objects.py as the main __init__ has new objects.") + with open(sentencepiece_file, "w", encoding="utf-8") as f: + f.write(sentencepiece_dummies) + else: + raise ValueError( + "The main __init__ has objects that are not present in transformers.utils.dummy_sentencepiece_objects.py.", + "Run `make fix-copies` to fix this.", + ) + + if tokenizers_dummies != actual_tokenizers_dummies: + if overwrite: + print("Updating transformers.utils.dummy_tokenizers_objects.py as the main __init__ has new objects.") + with open(tokenizers_file, "w", encoding="utf-8") as f: + f.write(tokenizers_dummies) + else: + raise ValueError( + "The main __init__ has objects that are not present in transformers.utils.dummy_tokenizers_objects.py.", + "Run `make fix-copies` to fix this.", + ) + if pt_dummies != actual_pt_dummies: if overwrite: print("Updating transformers.utils.dummy_pt_objects.py as the main __init__ has new objects.")