From 08cb5718ec206bcf34fcd85a03e3e7cbfab8a9e6 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 30 Dec 2021 17:30:58 +0100
Subject: [PATCH] Enabling `tokenizers` upgrade. (#14941)

* Enabling `tokenizers` upgrade.

* Moved ugly comment.

* Tokenizers==0.11.1 needs an update to keep borrow checker happy in highly contiguous calls.

* Support both 0.11.1 and 0.11.0
---
 setup.py                                      | 19 ++++++++++++++++---
 src/transformers/dependency_versions_table.py |  2 +-
 src/transformers/tokenization_utils_fast.py   | 18 ++++++++++++++++--
 3 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index 2a0727b2998..015b7a80422 100644
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@ from pathlib import Path
 
 from setuptools import find_packages, setup
 
+
 # Remove stale transformers.egg-info directory to avoid https://github.com/pypa/pip/issues/5466
 stale_egg_info = Path(__file__).parent / "transformers.egg-info"
 if stale_egg_info.exists():
@@ -148,7 +149,7 @@ _deps = [
     "tensorflow>=2.3",
     "timeout-decorator",
     "timm",
-    "tokenizers>=0.10.1,<0.11",
+    "tokenizers>=0.10.1",
     "torch>=1.0",
     "torchaudio",
     "pyctcdecode>=0.2.0",
@@ -256,7 +257,8 @@ extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
 extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer")
-extras["speech"] = deps_list("torchaudio") + extras["audio"]  # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
+# `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
+extras["speech"] = deps_list("torchaudio") + extras["audio"]
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
 extras["tf-speech"] = extras["audio"]
 extras["flax-speech"] = extras["audio"]
@@ -267,7 +269,18 @@ extras["codecarbon"] = deps_list("codecarbon")
 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["testing"] = (
     deps_list(
-        "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black", "sacrebleu", "rouge-score", "nltk", "GitPython"
+        "pytest",
+        "pytest-xdist",
+        "timeout-decorator",
+        "parameterized",
+        "psutil",
+        "datasets",
+        "pytest-timeout",
+        "black",
+        "sacrebleu",
+        "rouge-score",
+        "nltk",
+        "GitPython",
     )
     + extras["retrieval"]
     + extras["modelcreation"]
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index a83325bfde1..ee8b22b30c2 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -59,7 +59,7 @@ deps = {
     "tensorflow": "tensorflow>=2.3",
     "timeout-decorator": "timeout-decorator",
     "timm": "timm",
-    "tokenizers": "tokenizers>=0.10.1,<0.11",
+    "tokenizers": "tokenizers>=0.10.1",
     "torch": "torch>=1.0",
     "torchaudio": "torchaudio",
     "pyctcdecode": "pyctcdecode>=0.2.0",
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 7d39c9e8181..e06f120b4d7 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -352,8 +352,22 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             if _truncation is not None:
                 self._tokenizer.no_truncation()
         else:
-            target = {"max_length": max_length, "stride": stride, "strategy": truncation_strategy.value}
-            if _truncation != target:
+            target = {
+                "max_length": max_length,
+                "stride": stride,
+                "strategy": truncation_strategy.value,
+            }
+
+            # _truncation might contain more keys than the target `transformers`
+            # supports. Use only the target keys to trigger `enable_truncation`.
+            # This should enable this code to work on various `tokenizers`
+            # targets.
+            if _truncation is None:
+                current = None
+            else:
+                current = {k: _truncation.get(k, None) for k in target}
+
+            if current != target:
                 self._tokenizer.enable_truncation(**target)
 
         if padding_strategy == PaddingStrategy.DO_NOT_PAD:
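
Below is a minimal, self-contained sketch of the comparison trick the last hunk
introduces. The inputs are hypothetical: the extra "direction" key is a made-up
stand-in for whatever additional truncation fields a newer `tokenizers` release
might report, and only the key-filtering logic mirrors the patch. Per the commit
message, avoiding the redundant `enable_truncation()` call is what keeps the
borrow checker happy on highly contiguous calls.

# Standalone sketch of the key-filtering comparison added above.
# All inputs here are hypothetical stand-ins; only the filtering
# logic mirrors the patch.

target = {"max_length": 512, "stride": 0, "strategy": "longest_first"}

# What a newer `tokenizers` might report for the same settings; the
# "direction" key is an invented example of an extra field.
_truncation = {
    "max_length": 512,
    "stride": 0,
    "strategy": "longest_first",
    "direction": "right",
}

# Naive comparison: the extra key makes identical settings look changed,
# so enable_truncation() would be re-invoked on every encode call.
assert _truncation != target

# Patched comparison: project _truncation onto the keys `transformers`
# actually manages before comparing.
current = None if _truncation is None else {k: _truncation.get(k, None) for k in target}
assert current == target  # settings match, so enable_truncation() is skipped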