Make OpenAIGPTTokenizer work with SpaCy 2.x and 3.x (#15019)

* Make OpenAIGPTTokenizer work with SpaCy 3.x

SpaCy 3.x changed the API for creating the tokenizer, and that change
breaks OpenAIGPTTokenizer. The SpaCy 2.x way of creating the tokenizer
no longer works under SpaCy 3.x, but the SpaCy 3.x way does work under
SpaCy 2.x. Switching to the new API therefore lets OpenAIGPTTokenizer
work under both SpaCy 2.x and SpaCy 3.x.
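For illustration only (not code from the commit; the variable names below are just for the example), a minimal sketch of the two APIs:

# SpaCy 2.x API for obtaining a tokenizer -- removed in SpaCy 3.x:
#     nlp = English(); tok = nlp.Defaults.create_tokenizer(nlp)
# SpaCy 3.x API, which also works under SpaCy 2.x and is what this change adopts:
from spacy.lang.en import English

nlp = English()
tok = nlp.tokenizer  # the tokenizer object is callable and returns a spaCy Doc
print([t.text for t in tok("Hello world!")])  # ['Hello', 'world', '!']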

* Add is_spacy_available and is_ftfy_available methods to file utils

* Add spacy and ftfy unittest decorators to testing utils

* Add tests for OpenAIGPTTokenizer that require spacy and ftfy

* Modify CircleCI config to run tests that require spacy and ftfy

* Remove unneeded unittest decorators and reuse test code

* Run make fixup
Author: cody-moveworks, 2022-01-10 04:53:20 -08:00 (committed by GitHub)
Parent: 9fbf7c87c3
Commit: a54961c5f7
5 changed files with 53 additions and 13 deletions

View File: .circleci/config.yml

@@ -99,7 +99,7 @@ jobs:
                   path: ~/transformers/tests_output.txt
             - store_artifacts:
                   path: ~/transformers/reports
     run_tests_torch_and_tf_all:
         working_directory: ~/transformers
         docker:
@@ -169,7 +169,7 @@ jobs:
                   path: ~/transformers/tests_output.txt
             - store_artifacts:
                   path: ~/transformers/reports
     run_tests_torch_and_flax_all:
         working_directory: ~/transformers
         docker:
@@ -237,7 +237,7 @@ jobs:
                   path: ~/transformers/tests_output.txt
             - store_artifacts:
                   path: ~/transformers/reports
     run_tests_torch_all:
         working_directory: ~/transformers
         docker:
@@ -304,7 +304,7 @@ jobs:
                   path: ~/transformers/tests_output.txt
             - store_artifacts:
                   path: ~/transformers/reports
     run_tests_tf_all:
         working_directory: ~/transformers
         docker:
@@ -370,7 +370,7 @@ jobs:
                   path: ~/transformers/tests_output.txt
             - store_artifacts:
                   path: ~/transformers/reports
     run_tests_flax_all:
         working_directory: ~/transformers
         docker:
@@ -437,7 +437,7 @@ jobs:
                   path: ~/transformers/tests_output.txt
             - store_artifacts:
                   path: ~/transformers/reports
     run_tests_pipelines_torch_all:
         working_directory: ~/transformers
         docker:
@@ -549,7 +549,7 @@ jobs:
                       - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
                       - v0.4-{{ checksum "setup.py" }}
             - run: pip install --upgrade pip
-            - run: pip install .[ja,testing,sentencepiece,jieba]
+            - run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy]
             - run: python -m unidic download
             - save_cache:
                   key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
@@ -557,7 +557,7 @@ jobs:
                       - '~/.cache/pip'
             - run: |
                   if [ -f test_list.txt ]; then
-                      python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py | tee tests_output.txt
+                      python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt
                   fi
             - store_artifacts:
                   path: ~/transformers/tests_output.txt
@@ -662,7 +662,7 @@ jobs:
                   path: ~/transformers/flax_examples_output.txt
             - store_artifacts:
                   path: ~/transformers/reports
     run_examples_flax_all:
         working_directory: ~/transformers
         docker:
@@ -729,7 +729,7 @@ jobs:
                   path: ~/transformers/tests_output.txt
             - store_artifacts:
                   path: ~/transformers/reports
     run_tests_hub_all:
         working_directory: ~/transformers
         docker:
@@ -795,7 +795,7 @@ jobs:
                   path: ~/transformers/tests_output.txt
             - store_artifacts:
                   path: ~/transformers/reports
     run_tests_onnxruntime_all:
         working_directory: ~/transformers
         docker:

View File: src/transformers/file_utils.py

@@ -512,6 +512,14 @@ def is_pytesseract_available():
     return importlib.util.find_spec("pytesseract") is not None


+def is_spacy_available():
+    return importlib.util.find_spec("spacy") is not None
+
+
+def is_ftfy_available():
+    return importlib.util.find_spec("ftfy") is not None
+
+
 def is_in_notebook():
     try:
         # Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py
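As a hedged usage sketch (not part of this diff), the new helpers follow the same guard pattern as the existing is_*_available checks:

# Illustrative only; assumes a transformers version that ships these helpers.
from transformers.file_utils import is_ftfy_available, is_spacy_available

if is_spacy_available() and is_ftfy_available():
    print("OpenAIGPTTokenizer can use the SpaCy + ftfy path")
else:
    print("OpenAIGPTTokenizer will fall back to BERT's BasicTokenizer")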

View File: src/transformers/models/openai/tokenization_openai.py

@@ -104,7 +104,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
             from spacy.lang.en import English

             _nlp = English()
-            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
+            self.nlp = _nlp.tokenizer
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")

View File: src/transformers/testing_utils.py

@@ -34,6 +34,7 @@ from .file_utils import (
     is_detectron2_available,
     is_faiss_available,
     is_flax_available,
+    is_ftfy_available,
     is_keras2onnx_available,
     is_librosa_available,
     is_onnx_available,
@@ -46,6 +47,7 @@ from .file_utils import (
     is_scatter_available,
     is_sentencepiece_available,
     is_soundfile_availble,
+    is_spacy_available,
     is_tensorflow_probability_available,
     is_tf_available,
     is_timm_available,
@@ -412,6 +414,26 @@ def require_vision(test_case):
     return test_case


+def require_ftfy(test_case):
+    """
+    Decorator marking a test that requires ftfy. These tests are skipped when ftfy isn't installed.
+    """
+    if not is_ftfy_available():
+        return unittest.skip("test requires ftfy")(test_case)
+    else:
+        return test_case
+
+
+def require_spacy(test_case):
+    """
+    Decorator marking a test that requires SpaCy. These tests are skipped when SpaCy isn't installed.
+    """
+    if not is_spacy_available():
+        return unittest.skip("test requires spacy")(test_case)
+    else:
+        return test_case
+
+
 def require_torch_multi_gpu(test_case):
     """
     Decorator marking a test that requires a multi-GPU setup (in PyTorch). These tests are skipped on a machine without
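A hedged side note (not in the diff): since the decorators just apply unittest.skip when the dependency is missing, they also compose on individual test methods; the class and method names below are hypothetical:

# Illustrative only.
import unittest

from transformers.testing_utils import require_ftfy, require_spacy


class ExampleSpacyTest(unittest.TestCase):
    @require_ftfy
    @require_spacy
    def test_needs_spacy_and_ftfy(self):
        import ftfy
        from spacy.lang.en import English

        self.assertEqual(ftfy.fix_text("Hello"), "Hello")
        self.assertEqual(len(English().tokenizer("Hello world")), 2)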

View File: tests/test_tokenization_openai.py

@@ -20,13 +20,14 @@ import unittest

 from transformers import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
 from transformers.models.openai.tokenization_openai import VOCAB_FILES_NAMES
-from transformers.testing_utils import require_tokenizers
+from transformers.testing_utils import require_ftfy, require_spacy, require_tokenizers

 from .test_tokenization_common import TokenizerTesterMixin


 @require_tokenizers
 class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    """Tests OpenAIGPTTokenizer that uses BERT BasicTokenizer."""

     tokenizer_class = OpenAIGPTTokenizer
     rust_tokenizer_class = OpenAIGPTTokenizerFast
@@ -132,3 +133,12 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     # tokenizer has no padding token
     def test_padding_different_model_input_name(self):
         pass
+
+
+@require_ftfy
+@require_spacy
+@require_tokenizers
+class OpenAIGPTTokenizationTestWithSpacy(OpenAIGPTTokenizationTest):
+    """Tests OpenAIGPTTokenizer that uses SpaCy and ftfy."""
+
+    pass