From bfd75056b0a080addafb7f3d7c9336d27b883a0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Tue, 20 Aug 2019 14:06:17 +0200 Subject: [PATCH 1/5] Update tokenization_xlm.py --- pytorch_transformers/tokenization_xlm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index b690a3a9458..8e7c2954f2c 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -124,8 +124,9 @@ class XLMTokenizer(PreTrainedTokenizer): **kwargs) try: import ftfy - import spacy - self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat']) + from spacy.lang.en import English + _nlp = English() + self.nlp = nlp.Defaults.create_tokenizer(_nlp) self.fix_text = ftfy.fix_text except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") From bb04446285be43059050406b3bc4938807c63c25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Tue, 20 Aug 2019 14:07:40 +0200 Subject: [PATCH 2/5] Update tokenization_openai.py --- pytorch_transformers/tokenization_openai.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py index 0eb5281d399..0f6a8f1daec 100644 --- a/pytorch_transformers/tokenization_openai.py +++ b/pytorch_transformers/tokenization_openai.py @@ -89,9 +89,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): try: import ftfy - import spacy - self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat']) - self.fix_text = ftfy.fix_text + from spacy.lang.en import English + _nlp = English() + self.nlp = nlp.Defaults.create_tokenizer(_nlp) except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") self.nlp = BasicTokenizer(do_lower_case=True) From 562b998366c7a4a2bd0addf1a860fbee0aa04d74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Tue, 20 Aug 2019 14:10:19 +0200 Subject: [PATCH 3/5] Update tokenization_openai.py --- pytorch_transformers/tokenization_openai.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py index 0f6a8f1daec..79eb023a8d1 100644 --- a/pytorch_transformers/tokenization_openai.py +++ b/pytorch_transformers/tokenization_openai.py @@ -92,6 +92,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): from spacy.lang.en import English _nlp = English() self.nlp = nlp.Defaults.create_tokenizer(_nlp) + self.fix_text = ftfy.fix_text except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") self.nlp = BasicTokenizer(do_lower_case=True) From f5e2ed0fd89d5730126d71c03324fa07ae674ca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Tue, 20 Aug 2019 14:19:25 +0200 Subject: [PATCH 4/5] Update tokenization_openai.py --- pytorch_transformers/tokenization_openai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py index 79eb023a8d1..51b418ebd36 100644 --- a/pytorch_transformers/tokenization_openai.py +++ b/pytorch_transformers/tokenization_openai.py @@ -91,7 +91,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): import ftfy from spacy.lang.en import English _nlp = English() - self.nlp = nlp.Defaults.create_tokenizer(_nlp) + self.nlp = _nlp.Defaults.create_tokenizer(_nlp) self.fix_text = ftfy.fix_text except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.") From 388e3251fa95b892949968dc89065e464a93b69f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?= <37592763+GuillemGSubies@users.noreply.github.com> Date: Tue, 20 Aug 2019 14:19:39 +0200 Subject: [PATCH 5/5] Update tokenization_xlm.py --- pytorch_transformers/tokenization_xlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py index 8e7c2954f2c..2d2f3a8cd4d 100644 --- a/pytorch_transformers/tokenization_xlm.py +++ b/pytorch_transformers/tokenization_xlm.py @@ -126,7 +126,7 @@ class XLMTokenizer(PreTrainedTokenizer): import ftfy from spacy.lang.en import English _nlp = English() - self.nlp = nlp.Defaults.create_tokenizer(_nlp) + self.nlp = _nlp.Defaults.create_tokenizer(_nlp) self.fix_text = ftfy.fix_text except ImportError: logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")