diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py index 3d5d3029dd9..3e931dfcf89 100644 --- a/transformers/tokenization_gpt2.py +++ b/transformers/tokenization_gpt2.py @@ -101,9 +101,10 @@ class GPT2Tokenizer(PreTrainedTokenizer): """ GPT-2 BPE tokenizer. Peculiarities: - Byte-level Byte-Pair-Encoding - - Requires a space to start the input string => will add a space is there isn't. - As a consequence, this tokenizer `encode` and `decode` method will not conserve - the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello" + - Requires a space to start the input string => the encoding methods should be called with the + ``add_prefix_space`` flag set to ``True``. + Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve + the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py index ee8e97d6bfa..32e92be2113 100644 --- a/transformers/tokenization_roberta.py +++ b/transformers/tokenization_roberta.py @@ -66,9 +66,10 @@ class RobertaTokenizer(GPT2Tokenizer): """ RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: - Byte-level Byte-Pair-Encoding - - Requires a space to start the input string => will add a space is there isn't. - As a consequence, this tokenizer `encode` and `decode` method will not conserve - the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello" + - Requires a space to start the input string => the encoding methods should be called with the + ``add_prefix_space`` flag set to ``True``. + Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve + the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP