mirror of
https://github.com/huggingface/transformers.git
synced 2025-08-02 19:21:31 +06:00
parent
7c9f8f93f9
commit
7a99e4b196
@ -173,9 +173,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
self.cache[token] = word
|
||||
return word
|
||||
|
||||
def _tokenize(self, text):
|
||||
""" Tokenize a string. """
|
||||
text = ' ' + text # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with.
|
||||
def _tokenize(self, text, add_prefix_space=False):
|
||||
""" Tokenize a string.
|
||||
Args:
|
||||
- add_prefix_space (boolean, default False):
|
||||
Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
|
||||
"""
|
||||
if add_prefix_space:
|
||||
text = ' ' + text
|
||||
|
||||
bpe_tokens = []
|
||||
for token in re.findall(self.pat, text):
|
||||
if sys.version_info[0] == 2:
|
||||
|
Loading…
Reference in New Issue
Block a user