fix #1196 and fix #1285

2025-08-02 19:21:31 +06:00 · 2019-09-26 08:41:02 +02:00 · 2019-09-26 08:41:02 +02:00 · 7a99e4b196
commit 7a99e4b196
parent 7c9f8f93f9
1 changed files with 9 additions and 3 deletions
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -173,9 +173,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
        self.cache[token] = word
        return word

-    def _tokenize(self, text):
-        """ Tokenize a string. """
-        text = ' ' + text  # GPT-2 (and RoBERTa) tokenizers need at least one space to begin the sentence with.
+    def _tokenize(self, text, add_prefix_space=False):
+        """ Tokenize a string.
+            Args:
+                - add_prefix_space (boolean, default False):
+                    Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
+        """
+        if add_prefix_space:
+            text = ' ' + text
+
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            if sys.version_info[0] == 2: