Merge pull request #1059 from GuillemGSubies/master

Better use of the spaCy tokenizer in the OpenAI and XLM tokenizers
Thomas Wolf 2019-08-21 01:53:48 +02:00 committed by GitHub
commit 41789c6c3d
2 changed files with 6 additions and 4 deletions


@@ -89,8 +89,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
         try:
             import ftfy
-            import spacy
-            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
+            from spacy.lang.en import English
+            _nlp = English()
+            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")


@@ -124,8 +124,9 @@ class XLMTokenizer(PreTrainedTokenizer):
                                        **kwargs)
         try:
             import ftfy
-            import spacy
-            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
+            from spacy.lang.en import English
+            _nlp = English()
+            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")