From 08cb5718ec206bcf34fcd85a03e3e7cbfab8a9e6 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Thu, 30 Dec 2021 17:30:58 +0100
Subject: [PATCH] Enabling `tokenizers` upgrade. (#14941)

* Enabling `tokenizers` upgrade.

* Moved ugly comment.

* Tokenizers==0.11.1 needs an update to keep borrow checker happy in highly contiguous calls.

* Support both 0.11.1 and 0.11.0
---
 setup.py                                      | 19 ++++++++++++++++---
 src/transformers/dependency_versions_table.py |  2 +-
 src/transformers/tokenization_utils_fast.py   | 18 ++++++++++++++++--
 3 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/setup.py b/setup.py
index 2a0727b2998..015b7a80422 100644
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@ from pathlib import Path
 
 from setuptools import find_packages, setup
 
+
 # Remove stale transformers.egg-info directory to avoid https://github.com/pypa/pip/issues/5466
 stale_egg_info = Path(__file__).parent / "transformers.egg-info"
 if stale_egg_info.exists():
@@ -148,7 +149,7 @@ _deps = [
     "tensorflow>=2.3",
     "timeout-decorator",
     "timm",
-    "tokenizers>=0.10.1,<0.11",
+    "tokenizers>=0.10.1",
     "torch>=1.0",
     "torchaudio",
     "pyctcdecode>=0.2.0",
@@ -256,7 +257,8 @@ extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
 extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer")
-extras["speech"] = deps_list("torchaudio") + extras["audio"]  # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
+# `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
+extras["speech"] = deps_list("torchaudio") + extras["audio"]
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
 extras["tf-speech"] = extras["audio"]
 extras["flax-speech"] = extras["audio"]
@@ -267,7 +269,18 @@ extras["codecarbon"] = deps_list("codecarbon")
 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["testing"] = (
     deps_list(
-        "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black", "sacrebleu", "rouge-score", "nltk", "GitPython"
+        "pytest",
+        "pytest-xdist",
+        "timeout-decorator",
+        "parameterized",
+        "psutil",
+        "datasets",
+        "pytest-timeout",
+        "black",
+        "sacrebleu",
+        "rouge-score",
+        "nltk",
+        "GitPython",
     )
     + extras["retrieval"]
     + extras["modelcreation"]
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index a83325bfde1..ee8b22b30c2 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -59,7 +59,7 @@ deps = {
     "tensorflow": "tensorflow>=2.3",
     "timeout-decorator": "timeout-decorator",
     "timm": "timm",
-    "tokenizers": "tokenizers>=0.10.1,<0.11",
+    "tokenizers": "tokenizers>=0.10.1",
     "torch": "torch>=1.0",
     "torchaudio": "torchaudio",
     "pyctcdecode": "pyctcdecode>=0.2.0",
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 7d39c9e8181..e06f120b4d7 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -352,8 +352,22 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             if _truncation is not None:
                 self._tokenizer.no_truncation()
         else:
-            target = {"max_length": max_length, "stride": stride, "strategy": truncation_strategy.value}
-            if _truncation != target:
+            target = {
+                "max_length": max_length,
+                "stride": stride,
+                "strategy": truncation_strategy.value,
+            }
+
+            # _truncation might contain more keys than the target `transformers`
+            # supports. Use only the target keys to trigger `enable_truncation`.
+            # This should enable this code to work on various `tokenizers`
+            # targets.
+            if _truncation is None:
+                current = None
+            else:
+                current = {k: _truncation.get(k, None) for k in target}
+
+            if current != target:
                 self._tokenizer.enable_truncation(**target)
 
         if padding_strategy == PaddingStrategy.DO_NOT_PAD:
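
Below is a minimal, self-contained sketch of the comparison trick the last hunk
introduces. The inputs are hypothetical: the extra "direction" key is a made-up
stand-in for whatever additional truncation fields a newer `tokenizers` release
might report, and only the key-filtering logic mirrors the patch. Per the commit
message, avoiding the redundant `enable_truncation()` call is what keeps the
borrow checker happy on highly contiguous calls.

# Standalone sketch of the key-filtering comparison added above.
# All inputs here are hypothetical stand-ins; only the filtering
# logic mirrors the patch.

target = {"max_length": 512, "stride": 0, "strategy": "longest_first"}

# What a newer `tokenizers` might report for the same settings; the
# "direction" key is an invented example of an extra field.
_truncation = {
    "max_length": 512,
    "stride": 0,
    "strategy": "longest_first",
    "direction": "right",
}

# Naive comparison: the extra key makes identical settings look changed,
# so enable_truncation() would be re-invoked on every encode call.
assert _truncation != target

# Patched comparison: project _truncation onto the keys `transformers`
# actually manages before comparing.
current = None if _truncation is None else {k: _truncation.get(k, None) for k in target}
assert current == target  # settings match, so enable_truncation() is skipped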