From bdaba1897c14e0243d7fb58ddf5061957c70eea6 Mon Sep 17 00:00:00 2001
From: thomwolf
Date: Tue, 16 Apr 2019 17:44:06 +0200
Subject: [PATCH] updating GPT tokenization

---
 pytorch_pretrained_bert/tokenization_openai.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization_openai.py b/pytorch_pretrained_bert/tokenization_openai.py
index 1088b5222bc..214a476ce96 100644
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -273,9 +273,8 @@ class OpenAIGPTTokenizer(object):
         if clean_up_tokenization_spaces:
             out_string = out_string.replace('<unk>', '')
             out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',').replace(' ,', ','
-                    ).replace(" n't", "n't").replace(" 'm", "'m").replace(" 're", "'re").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" t ", "'t ").replace(" s ", "'s ").replace(" m ", "'m "
-                    ).replace(" 've", "'ve")
+                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
+                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
         return out_string
 
     def save_vocabulary(self, vocab_path):
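
For reference, a minimal standalone sketch of the cleanup chain as it reads after this patch. The free function name and the sample strings are illustrative only and not part of the patch; in the repository this logic sits inside OpenAIGPTTokenizer.decode() behind the clean_up_tokenization_spaces flag.

# Illustrative sketch of the post-patch cleanup chain (function name and samples are hypothetical).
def clean_up_tokenization(out_string):
    # Drop unknown-token markers, then collapse the spaces detokenization leaves
    # before punctuation and contraction suffixes.
    out_string = out_string.replace('<unk>', '')
    out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
    return out_string

print(clean_up_tokenization("he 's here , is n't he ?"))     # he's here, isn't he?
print(clean_up_tokenization("they 've left . we 're done"))  # they've left. we're done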