Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-31 18:22:34 +06:00
Improve type checker performance (#13094)
* Conditionally declare `TOKENIZER_MAPPING_NAMES` within an `if TYPE_CHECKING` block so that type checkers don't need to evaluate the RHS of the assignment. This improves performance of the Pylance/Pyright type checkers.

* Update src/transformers/models/auto/tokenization_auto.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Add missing import

* Format

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent b9962b8656
commit e46ad22cd6
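The trick behind the change: `typing.TYPE_CHECKING` is False when the program actually runs, but static type checkers treat it as True. A huge dict literal can therefore be hidden from the checker behind a single cheap, fully annotated stub, while the interpreter still builds the real mapping. A minimal sketch of the pattern (the `MAPPING` name and the two sample entries are illustrative, not part of the commit):

    from collections import OrderedDict
    from typing import TYPE_CHECKING, Optional, Tuple

    if TYPE_CHECKING:
        # Type checkers analyze only this branch: one cheap annotated stub.
        # Subscripting OrderedDict is safe here because this line is never
        # executed at runtime.
        MAPPING: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
    else:
        # The interpreter takes this branch and builds the real mapping.
        MAPPING = OrderedDict(
            [
                ("bert", ("BertTokenizer", "BertTokenizerFast")),
                ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast")),
            ]
        )

With the stub in place, Pyright/Pylance reads one annotation instead of inferring a type for every entry of the literal.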
src/transformers/models/auto/tokenization_auto.py

@@ -18,7 +18,7 @@ import importlib
 import json
 import os
 from collections import OrderedDict
-from typing import Dict, Optional, Union
+from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
 
 from ...configuration_utils import PretrainedConfig
 from ...file_utils import (
@@ -43,153 +43,163 @@ from .configuration_auto import (
 
 logger = logging.get_logger(__name__)
 
 
-TOKENIZER_MAPPING_NAMES = OrderedDict(
-    [
-        ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
-        ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
-        (
-            "t5",
-            (
-                "T5Tokenizer" if is_sentencepiece_available() else None,
-                "T5TokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        (
-            "mt5",
-            (
-                "MT5Tokenizer" if is_sentencepiece_available() else None,
-                "MT5TokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
-        ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
-        (
-            "albert",
-            (
-                "AlbertTokenizer" if is_sentencepiece_available() else None,
-                "AlbertTokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        (
-            "camembert",
-            (
-                "CamembertTokenizer" if is_sentencepiece_available() else None,
-                "CamembertTokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        (
-            "pegasus",
-            (
-                "PegasusTokenizer" if is_sentencepiece_available() else None,
-                "PegasusTokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        (
-            "mbart",
-            (
-                "MBartTokenizer" if is_sentencepiece_available() else None,
-                "MBartTokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        (
-            "xlm-roberta",
-            (
-                "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
-                "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
-        ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
-        ("blenderbot", ("BlenderbotTokenizer", None)),
-        ("bart", ("BartTokenizer", "BartTokenizerFast")),
-        ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
-        ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
-        (
-            "reformer",
-            (
-                "ReformerTokenizer" if is_sentencepiece_available() else None,
-                "ReformerTokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
-        ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
-        ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
-        ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
-        (
-            "dpr",
-            ("DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None),
-        ),
-        ("squeezebert", ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None)),
-        ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
-        ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
-        ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
-        ("transfo-xl", ("TransfoXLTokenizer", None)),
-        (
-            "xlnet",
-            (
-                "XLNetTokenizer" if is_sentencepiece_available() else None,
-                "XLNetTokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        ("flaubert", ("FlaubertTokenizer", None)),
-        ("xlm", ("XLMTokenizer", None)),
-        ("ctrl", ("CTRLTokenizer", None)),
-        ("fsmt", ("FSMTTokenizer", None)),
-        ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
-        ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
-        ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, None)),
-        ("rag", ("RagTokenizer", None)),
-        ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
-        ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
-        ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
-        ("prophetnet", ("ProphetNetTokenizer", None)),
-        ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
-        ("tapas", ("TapasTokenizer", None)),
-        ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
-        ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
-        (
-            "big_bird",
-            (
-                "BigBirdTokenizer" if is_sentencepiece_available() else None,
-                "BigBirdTokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
-        ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
-        ("hubert", ("Wav2Vec2CTCTokenizer", None)),
-        ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
-        ("luke", ("LukeTokenizer", None)),
-        ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
-        ("canine", ("CanineTokenizer", None)),
-        ("bertweet", ("BertweetTokenizer", None)),
-        ("bert-japanese", ("BertJapaneseTokenizer", None)),
-        ("byt5", ("ByT5Tokenizer", None)),
-        (
-            "cpm",
-            (
-                "CpmTokenizer" if is_sentencepiece_available() else None,
-                "CpmTokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
-        ("phobert", ("PhobertTokenizer", None)),
-        (
-            "barthez",
-            (
-                "BarthezTokenizer" if is_sentencepiece_available() else None,
-                "BarthezTokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-        (
-            "mbart50",
-            (
-                "MBart50Tokenizer" if is_sentencepiece_available() else None,
-                "MBart50TokenizerFast" if is_tokenizers_available() else None,
-            ),
-        ),
-    ]
-)
+if TYPE_CHECKING:
+    # This significantly improves completion suggestion performance when
+    # the transformers package is used with Microsoft's Pylance language server.
+    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
+else:
+    TOKENIZER_MAPPING_NAMES = OrderedDict(
+        [
+            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
+            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
+            (
+                "t5",
+                (
+                    "T5Tokenizer" if is_sentencepiece_available() else None,
+                    "T5TokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            (
+                "mt5",
+                (
+                    "MT5Tokenizer" if is_sentencepiece_available() else None,
+                    "MT5TokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
+            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
+            (
+                "albert",
+                (
+                    "AlbertTokenizer" if is_sentencepiece_available() else None,
+                    "AlbertTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            (
+                "camembert",
+                (
+                    "CamembertTokenizer" if is_sentencepiece_available() else None,
+                    "CamembertTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            (
+                "pegasus",
+                (
+                    "PegasusTokenizer" if is_sentencepiece_available() else None,
+                    "PegasusTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            (
+                "mbart",
+                (
+                    "MBartTokenizer" if is_sentencepiece_available() else None,
+                    "MBartTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            (
+                "xlm-roberta",
+                (
+                    "XLMRobertaTokenizer" if is_sentencepiece_available() else None,
+                    "XLMRobertaTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
+            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
+            ("blenderbot", ("BlenderbotTokenizer", None)),
+            ("bart", ("BartTokenizer", "BartTokenizerFast")),
+            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
+            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
+            (
+                "reformer",
+                (
+                    "ReformerTokenizer" if is_sentencepiece_available() else None,
+                    "ReformerTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
+            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
+            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
+            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
+            (
+                "dpr",
+                (
+                    "DPRQuestionEncoderTokenizer",
+                    "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            (
+                "squeezebert",
+                ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None),
+            ),
+            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
+            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
+            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
+            ("transfo-xl", ("TransfoXLTokenizer", None)),
+            (
+                "xlnet",
+                (
+                    "XLNetTokenizer" if is_sentencepiece_available() else None,
+                    "XLNetTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            ("flaubert", ("FlaubertTokenizer", None)),
+            ("xlm", ("XLMTokenizer", None)),
+            ("ctrl", ("CTRLTokenizer", None)),
+            ("fsmt", ("FSMTTokenizer", None)),
+            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
+            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
+            ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, None)),
+            ("rag", ("RagTokenizer", None)),
+            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
+            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
+            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
+            ("prophetnet", ("ProphetNetTokenizer", None)),
+            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
+            ("tapas", ("TapasTokenizer", None)),
+            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
+            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
+            (
+                "big_bird",
+                (
+                    "BigBirdTokenizer" if is_sentencepiece_available() else None,
+                    "BigBirdTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
+            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
+            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
+            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
+            ("luke", ("LukeTokenizer", None)),
+            ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)),
+            ("canine", ("CanineTokenizer", None)),
+            ("bertweet", ("BertweetTokenizer", None)),
+            ("bert-japanese", ("BertJapaneseTokenizer", None)),
+            ("byt5", ("ByT5Tokenizer", None)),
+            (
+                "cpm",
+                (
+                    "CpmTokenizer" if is_sentencepiece_available() else None,
+                    "CpmTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
+            ("phobert", ("PhobertTokenizer", None)),
+            (
+                "barthez",
+                (
+                    "BarthezTokenizer" if is_sentencepiece_available() else None,
+                    "BarthezTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+            (
+                "mbart50",
+                (
+                    "MBart50Tokenizer" if is_sentencepiece_available() else None,
+                    "MBart50TokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
+        ]
+    )
 
 
 TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)
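At runtime nothing changes: TYPE_CHECKING is False when Python executes the module, so the else: branch builds exactly the mapping that existed before the commit, and _LazyAutoMapping receives the same object. A quick sanity check (hypothetical snippet, not part of the commit; assumes transformers and tokenizers are installed):

    from typing import TYPE_CHECKING

    from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES

    # TYPE_CHECKING is always False outside of static analysis.
    assert not TYPE_CHECKING

    # The full mapping was built by the `else:` branch, exactly as before.
    print(TOKENIZER_MAPPING_NAMES["bert"])  # ('BertTokenizer', 'BertTokenizerFast')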