# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for Python tokenizers.
    For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py
"""

import itertools
import re
import unicodedata
from typing import Any, Dict, List, Optional, Tuple, Union, overload

from .file_utils import add_end_docstrings
from .tokenization_utils_base import (
    ENCODE_KWARGS_DOCSTRING,
    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    EncodedInput,
    EncodedInputPair,
    PaddingStrategy,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    TensorType,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import logging


logger = logging.get_logger(__name__)


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def _is_end_of_word(text):
    """Checks whether the last character in text is one of a punctuation, control or whitespace character."""
    last_char = text[-1]
    return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))


def _is_start_of_word(text):
    """Checks whether the first character in text is one of a punctuation, control or whitespace character."""
    first_char = text[0]
    return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))


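# Quick illustration (not part of the library) of how these helpers classify characters;
# the results below assume a standard Unicode character database:
#
#     _is_whitespace(" ")    # True  (ASCII space)
#     _is_whitespace("\n")   # True  (treated as whitespace even though it is a control char)
#     _is_control("\x00")    # True  (Unicode category "Cc")
#     _is_punctuation("$")   # True  (non-letter/number ASCII counts as punctuation here)
#     _is_punctuation("a")   # False (letter)

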
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING, """ .. automethod:: __call__""")
class PreTrainedTokenizer(PreTrainedTokenizerBase):
    """
    Base class for all slow tokenizers.

    Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.

    Handles all the shared methods for tokenization and special tokens, as well as the methods for
    downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't
    have to handle the specific vocabulary augmentation methods of the various underlying
    dictionary structures (BPE, sentencepiece...).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Added tokens - We store this for both slow and fast tokenizers
        # until the serialization of Fast tokenizers is updated
        self.added_tokens_encoder: Dict[str, int] = {}
        self.added_tokens_decoder: Dict[int, str] = {}
        self.unique_no_split_tokens: List[str] = []

    @property
    def is_fast(self) -> bool:
        return False

    @property
    def vocab_size(self) -> int:
        """
        :obj:`int`: Size of the base vocabulary (without the added tokens).
        """
        raise NotImplementedError

    def get_vocab(self) -> Dict[str, int]:
        """
        Returns the vocabulary as a dictionary of token to index.

        :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
        :obj:`token` is in the vocab.

        Returns:
            :obj:`Dict[str, int]`: The vocabulary.
        """
        raise NotImplementedError()

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            :obj:`Dict[str, int]`: The added tokens.
        """
        return self.added_tokens_encoder

    def __len__(self):
        """
        Size of the full vocabulary with the added tokens.
        """
        return self.vocab_size + len(self.added_tokens_encoder)

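    # Illustrative sketch (not part of the class): for a concrete subclass such as BertTokenizer,
    # ``len(tokenizer)`` counts both the base vocab and any added tokens. The token name below is
    # made up for the example:
    #
    #     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #     base_size = tokenizer.vocab_size            # base vocabulary only (e.g. 30522 for this checkpoint)
    #     tokenizer.add_tokens(["<new_token>"])
    #     assert len(tokenizer) == base_size + 1      # added tokens extend the base vocabulary
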
    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the
        vocabulary, they are added to it with indices starting from the length of the current vocabulary.

        Args:
            new_tokens (:obj:`List[str]` or :obj:`List[tokenizers.AddedToken]`):
                Token(s) to add to the vocabulary. A token is only added if it's not already in the vocabulary
                (tested by checking if the tokenizer assigns the index of the ``unk_token`` to it).
            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the tokens should be added as special tokens.

        Returns:
            :obj:`int`: The number of tokens actually added to the vocabulary.

        Examples::

            # Let's see how to increase the vocabulary of Bert model and tokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            model = BertModel.from_pretrained('bert-base-uncased')

            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
            print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary,
            # i.e. the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))
        """
        new_tokens = [str(tok) for tok in new_tokens]

        tokens_to_add = []
        for token in new_tokens:
            assert isinstance(token, str)
            if not special_tokens and self.init_kwargs.get("do_lower_case", False):
                token = token.lower()
            if (
                token != self.unk_token
                and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
                and token not in tokens_to_add
            ):
                tokens_to_add.append(token)
                if self.verbose:
                    logger.info("Adding %s to the vocabulary", token)

        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
        self.added_tokens_encoder.update(added_tok_encoder)
        self.added_tokens_decoder.update(added_tok_decoder)

        # Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert)
        if special_tokens:
            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
        else:
            # Or on the newly added tokens
            self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))

        return len(tokens_to_add)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        .. note::
            This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
            put this inside your training loop.

        Args:
            pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            :obj:`int`: Number of special tokens added to sequences.
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

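    # Example sketch (model-dependent): a BERT-style tokenizer typically wraps a single sequence as
    # [CLS] ... [SEP] and a pair as [CLS] ... [SEP] ... [SEP], so:
    #
    #     tokenizer.num_special_tokens_to_add(pair=False)  # usually 2 for BERT-like models
    #     tokenizer.num_special_tokens_to_add(pair=True)   # usually 3 for BERT-like models
    #
    # Because a dummy input is encoded on every call, cache the result instead of calling this
    # inside a training loop.
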
    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
        """
        Converts a string into a sequence of tokens, using the tokenizer.

        Splits into words for word-based vocabularies or into sub-words for sub-word-based vocabularies
        (BPE/SentencePiece/WordPiece). Takes care of added tokens.

        Args:
            text (:obj:`str`):
                The sequence to be encoded.
            **kwargs (additional keyword arguments):
                Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method.

        Returns:
            :obj:`List[str]`: The list of tokens.
        """
        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
        all_special_tokens_extended = dict(
            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
        )

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        if kwargs:
            logger.warning(f"Keyword arguments {kwargs} not recognized.")

        # TODO: should this be in the base class?
        if self.init_kwargs.get("do_lower_case", False):
            # convert non-special tokens to lowercase
            escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        def split_on_token(tok, text):
            result = []
            tok_extended = all_special_tokens_extended.get(tok, None)
            split_text = text.split(tok)
            full_word = ""
            for i, sub_text in enumerate(split_text):
                # AddedToken can control whitespace stripping around them.
                # We use them for GPT2 and Roberta to have different behavior depending on the special token
                # Cf. https://github.com/huggingface/transformers/pull/2778
                # and https://github.com/huggingface/transformers/issues/3788
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.single_word:
                        # Try to avoid splitting on token
                        if (
                            i < len(split_text) - 1
                            and not _is_end_of_word(sub_text)
                            and not _is_start_of_word(split_text[i + 1])
                        ):
                            # Don't extract the special token
                            full_word += sub_text + tok
                        elif full_word:
                            full_word += sub_text
                            result += [full_word]
                            full_word = ""
                            continue
                    # Strip white spaces on the right
                    if tok_extended.rstrip and i > 0:
                        # A bit counter-intuitive but we strip the left of the string
                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
                        sub_text = sub_text.lstrip()
                    # Strip white spaces on the left
                    if tok_extended.lstrip and i < len(split_text) - 1:
                        sub_text = sub_text.rstrip()  # Opposite here
                else:
                    # We strip left and right by default
                    if i < len(split_text) - 1:
                        sub_text = sub_text.rstrip()
                    if i > 0:
                        sub_text = sub_text.lstrip()

                if i == 0 and not sub_text:
                    result += [tok]
                elif i == len(split_text) - 1:
                    if sub_text:
                        result += [sub_text]
                    else:
                        pass
                else:
                    if sub_text:
                        result += [sub_text]
                    result += [tok]
            return result

        def split_on_tokens(tok_list, text):
            if not text.strip():
                return []
            if not tok_list:
                return self._tokenize(text)

            tokenized_text = []
            text_list = [text]
            for tok in tok_list:
                tokenized_text = []
                for sub_text in text_list:
                    if sub_text not in self.unique_no_split_tokens:
                        tokenized_text += split_on_token(tok, sub_text)
                    else:
                        tokenized_text += [sub_text]
                text_list = tokenized_text

            return list(
                itertools.chain.from_iterable(
                    (
                        self._tokenize(token) if token not in self.unique_no_split_tokens else [token]
                        for token in tokenized_text
                    )
                )
            )

        no_split_token = self.unique_no_split_tokens
        tokenized_text = split_on_tokens(no_split_token, text)
        return tokenized_text

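    # Rough usage sketch (the token strings are hypothetical): added/special tokens listed in
    # ``unique_no_split_tokens`` are kept whole, and only the text in between goes through the
    # model-specific ``self._tokenize``:
    #
    #     tokenizer.add_tokens(["<my_tok>"])
    #     tokenizer.tokenize("hello <my_tok> world")
    #     # -> ["hello", "<my_tok>", "world"]   (the added token is never split into sub-words)
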
    def _tokenize(self, text, **kwargs):
        """
        Converts a string into a sequence of tokens (string), using the tokenizer.
        Splits into words for word-based vocabularies or into sub-words for sub-word-based vocabularies
        (BPE/SentencePiece/WordPiece).

        Does NOT take care of added tokens.
        """
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using
        the vocabulary.

        Args:
            tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

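    # Minimal sketch: ids for added tokens come from ``added_tokens_encoder``, everything else from
    # the model-specific ``_convert_token_to_id``. The example values are placeholders and depend on
    # the checkpoint:
    #
    #     tokens = tokenizer.tokenize("hello world")
    #     tokenizer.convert_tokens_to_ids(tokens)     # e.g. [7592, 2088] (checkpoint-dependent)
    #     tokenizer.convert_tokens_to_ids("hello")    # a single str returns a single int
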
    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None

        if token in self.added_tokens_encoder:
            return self.added_tokens_encoder[token]
        return self._convert_token_to_id(token)

    def _convert_token_to_id(self, token):
        raise NotImplementedError

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_pretokenized:
                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                if is_pretokenized:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_pretokenized=True`."
                    )
                else:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                    )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        first_ids = get_input_ids(text)
        second_ids = get_input_ids(text_pair) if text_pair is not None else None

        return self.prepare_for_model(
            first_ids,
            pair_ids=second_ids,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )

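    # Note (sketch): ``_encode_plus`` is the slow-tokenizer backend reached through the public
    # ``__call__``/``encode_plus`` API defined on the base class, e.g.:
    #
    #     enc = tokenizer("A sentence", "An optional pair", max_length=16,
    #                     padding="max_length", truncation=True, return_tensors="pt")
    #     enc["input_ids"], enc["attention_mask"]   # tensors shaped (1, 16), batch axis prepended
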
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_pretokenized:
                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                raise ValueError(
                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        input_ids = []
        for ids_or_pair_ids in batch_text_or_text_pairs:
            if not isinstance(ids_or_pair_ids, (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            elif is_pretokenized and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            else:
                ids, pair_ids = ids_or_pair_ids

            first_ids = get_input_ids(ids)
            second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
            input_ids.append((first_ids, second_ids))

        batch_outputs = self._batch_prepare_for_model(
            input_ids,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        return BatchEncoding(batch_outputs)

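    # Note (sketch): the batched path is reached through the same public API when lists are passed;
    # sequence pairs can be given as two parallel lists:
    #
    #     batch = tokenizer(["question 1", "question 2"],
    #                       ["context 1", "context 2"],
    #                       padding=True, truncation=True)
    #     len(batch["input_ids"])   # == 2, one encoded pair per input
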
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def _batch_prepare_for_model(
        self,
        batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
        It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens.

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """

        batch_outputs = {}
        for first_ids, second_ids in batch_ids_pairs:
            outputs = self.prepare_for_model(
                first_ids,
                second_ids,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        return batch_outputs

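    # Design note (sketch): each pair is prepared above without padding, then ``self.pad`` pads the
    # whole batch in one pass. With ``padding_strategy=PaddingStrategy.LONGEST`` this amounts to
    # dynamic padding, e.g.:
    #
    #     batch = tokenizer(["short", "a slightly longer sentence"], padding=True)
    #     # both rows of batch["input_ids"] are padded to the longer of the two lengths,
    #     # not to the model's maximum length
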
    def prepare_for_tokenization(
        self, text: str, is_pretokenized: bool = False, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well.
        We test the :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (:obj:`str`):
                The text to prepare.
            is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the text has been pretokenized.
            kwargs:
                Keyword arguments to use for the tokenization.

        Returns:
            :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        """
        return (text, kwargs)

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (:obj:`List[int]`, `optional`):
                List of ids of the second sequence.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    @overload
    def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str:
        ...

    @overload
    def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]:
        ...

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary
        and added tokens.

        Args:
            ids (:obj:`int` or :obj:`List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            :obj:`str` or :obj:`List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            if ids in self.added_tokens_decoder:
                return self.added_tokens_decoder[ids]
            else:
                return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            if index in self.added_tokens_decoder:
                tokens.append(self.added_tokens_decoder[index])
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens

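    # Minimal sketch (outputs assume a BERT-like vocab): added-token ids resolve through
    # ``added_tokens_decoder``; anything else goes through the model-specific ``_convert_id_to_token``:
    #
    #     ids = tokenizer.encode("hello world", add_special_tokens=True)
    #     tokenizer.convert_ids_to_tokens(ids)
    #     # -> e.g. ["[CLS]", "hello", "world", "[SEP]"]
    #     tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
    #     # -> ["hello", "world"]
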
    def _convert_id_to_token(self, index: int) -> str:
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a sequence of tokens (string) into a single string.

        The simplest way to do it is ``" ".join(tokens)``, but we often want to remove
        sub-word tokenization artifacts at the same time.

        Args:
            tokens (:obj:`List[str]`): The tokens to join into a string.

        Returns:
            :obj:`str`: The joined tokens.
        """
        return " ".join(tokens)

    def decode(
        self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
    ) -> str:
        """
        Converts a sequence of ids into a string, using the tokenizer and vocabulary,
        with options to remove special tokens and clean up tokenization spaces.

        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

        Args:
            token_ids (:obj:`List[int]`):
                List of tokenized input ids. Can be obtained using the ``__call__`` method.
            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to clean up the tokenization spaces.

        Returns:
            :obj:`str`: The decoded sentence.
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        # To avoid mixing byte-level and unicode for byte-level BPE
        # we need to build the string separately for added tokens and byte-level tokens
        # cf. https://github.com/huggingface/transformers/issues/1133
        sub_texts = []
        current_sub_text = []
        for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_ids:
                continue
            if token in self.added_tokens_encoder:
                if current_sub_text:
                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                    current_sub_text = []
                sub_texts.append(token)
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
        text = " ".join(sub_texts)

        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text

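    # Round-trip sketch (BERT-like, lowercasing special tokens assumed; exact output is model-dependent):
    #
    #     ids = tokenizer.encode("Hello, world!")
    #     tokenizer.decode(ids)
    #     # -> "[CLS] hello, world! [SEP]"   (clean_up_tokenization_spaces re-attaches punctuation)
    #     tokenizer.decode(ids, skip_special_tokens=True)
    #     # -> "hello, world!"
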
    def save_vocabulary(self, save_directory) -> Tuple[str]:
        """
        Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
        and special token mappings.

        .. warning::
            Please use :meth:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full tokenizer state if
            you want to reload it using the :meth:`~transformers.PreTrainedTokenizer.from_pretrained` class method.

        Args:
            save_directory (:obj:`str`): The path to a directory where the tokenizer will be saved.

        Returns:
            A tuple of :obj:`str`: The files saved.
        """
        raise NotImplementedError
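
    # Usage note (sketch): subclasses implement ``save_vocabulary`` (vocabulary files only).
    # To persist the full state, including added tokens and special-token mappings, use:
    #
    #     tokenizer.save_pretrained("./my_tokenizer")
    #     reloaded = BertTokenizer.from_pretrained("./my_tokenizer")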