mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-30 17:52:35 +06:00
Enabling tokenizers
upgrade. (#14941)
* Enabling `tokenizers` upgrade. * Moved ugly comment. * Tokenizers==0.11.1 needs an update to keep borrow checker happy in highly contiguous calls. * Support both 0.11.1 and 0.11.0
This commit is contained in:
parent
f8a989cfb2
commit
08cb5718ec
19
setup.py
19
setup.py
@ -71,6 +71,7 @@ from pathlib import Path
|
||||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
|
||||
# Remove stale transformers.egg-info directory to avoid https://github.com/pypa/pip/issues/5466
|
||||
stale_egg_info = Path(__file__).parent / "transformers.egg-info"
|
||||
if stale_egg_info.exists():
|
||||
@ -148,7 +149,7 @@ _deps = [
|
||||
"tensorflow>=2.3",
|
||||
"timeout-decorator",
|
||||
"timm",
|
||||
"tokenizers>=0.10.1,<0.11",
|
||||
"tokenizers>=0.10.1",
|
||||
"torch>=1.0",
|
||||
"torchaudio",
|
||||
"pyctcdecode>=0.2.0",
|
||||
@ -256,7 +257,8 @@ extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
|
||||
|
||||
extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
|
||||
extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer")
|
||||
extras["speech"] = deps_list("torchaudio") + extras["audio"] # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
|
||||
# `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
|
||||
extras["speech"] = deps_list("torchaudio") + extras["audio"]
|
||||
extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
|
||||
extras["tf-speech"] = extras["audio"]
|
||||
extras["flax-speech"] = extras["audio"]
|
||||
@ -267,7 +269,18 @@ extras["codecarbon"] = deps_list("codecarbon")
|
||||
extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
|
||||
extras["testing"] = (
|
||||
deps_list(
|
||||
"pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-timeout", "black", "sacrebleu", "rouge-score", "nltk", "GitPython"
|
||||
"pytest",
|
||||
"pytest-xdist",
|
||||
"timeout-decorator",
|
||||
"parameterized",
|
||||
"psutil",
|
||||
"datasets",
|
||||
"pytest-timeout",
|
||||
"black",
|
||||
"sacrebleu",
|
||||
"rouge-score",
|
||||
"nltk",
|
||||
"GitPython",
|
||||
)
|
||||
+ extras["retrieval"]
|
||||
+ extras["modelcreation"]
|
||||
|
@ -59,7 +59,7 @@ deps = {
|
||||
"tensorflow": "tensorflow>=2.3",
|
||||
"timeout-decorator": "timeout-decorator",
|
||||
"timm": "timm",
|
||||
"tokenizers": "tokenizers>=0.10.1,<0.11",
|
||||
"tokenizers": "tokenizers>=0.10.1",
|
||||
"torch": "torch>=1.0",
|
||||
"torchaudio": "torchaudio",
|
||||
"pyctcdecode": "pyctcdecode>=0.2.0",
|
||||
|
@ -352,8 +352,22 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
||||
if _truncation is not None:
|
||||
self._tokenizer.no_truncation()
|
||||
else:
|
||||
target = {"max_length": max_length, "stride": stride, "strategy": truncation_strategy.value}
|
||||
if _truncation != target:
|
||||
target = {
|
||||
"max_length": max_length,
|
||||
"stride": stride,
|
||||
"strategy": truncation_strategy.value,
|
||||
}
|
||||
|
||||
# _truncation might contain more keys than the target `transformers`
|
||||
# supports. Use only the target keys to trigger `enable_truncation`.
|
||||
# This should enable this code to work on various `tokenizers`
|
||||
# targets.
|
||||
if _truncation is None:
|
||||
current = None
|
||||
else:
|
||||
current = {k: _truncation.get(k, None) for k in target}
|
||||
|
||||
if current != target:
|
||||
self._tokenizer.enable_truncation(**target)
|
||||
|
||||
if padding_strategy == PaddingStrategy.DO_NOT_PAD:
|
||||
|
Loading…
Reference in New Issue
Block a user