Copy tokenizer files into each model's repo (#10624)

* Move tokenizer files into each repo

* Fix mBART50 tests

* Fix mBART tests

* Fix Marian tests

* Update templates
Sylvain Gugger 2021-03-10 11:26:23 -05:00 committed by GitHub
parent d26b37e744
commit 2295d783d5
38 changed files with 378 additions and 317 deletions
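To make the per-file diffs below easier to follow, here is a minimal sketch (Python, using the BART entries taken from this very diff) of the pattern the commit applies across all tokenizer modules: the shared-URL shortcuts are replaced by explicit module-level maps that point every checkpoint at files hosted in its own model repo.

# Before: all BART checkpoints reused the files hosted under roberta-large.
vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
_all_bart_models = ["facebook/bart-base", "facebook/bart-large"]  # abbreviated

pretrained_vocab_files_map = {
    "vocab_file": {m: vocab_url for m in _all_bart_models},
    "merges_file": {m: merges_url for m in _all_bart_models},
}

# After: each checkpoint resolves its own copy of the tokenizer files from its
# own repo, spelled out in a module-level constant the class then points to.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json",
        "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json",
        # ... one entry per checkpoint
    },
    "merges_file": {
        "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt",
        "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt",
        # ... one entry per checkpoint
    },
}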


@ -20,18 +20,36 @@ from ..roberta.tokenization_roberta import RobertaTokenizer
logger = logging.get_logger(__name__)
# vocab and merges same as roberta
vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
_all_bart_models = [
"facebook/bart-base",
"facebook/bart-large",
"facebook/bart-large-mnli",
"facebook/bart-large-cnn",
"facebook/bart-large-xsum",
"yjernite/bart_eli5",
# This is not exhaustive: see https://huggingface.co/models?filter=bart
]
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
# See all BART models at https://huggingface.co/models?filter=bart
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json",
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json",
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json",
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json",
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json",
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json",
},
"merges_file": {
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt",
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt",
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt",
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt",
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt",
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/bart-base": 1024,
"facebook/bart-large": 1024,
"facebook/bart-large-mnli": 1024,
"facebook/bart-large-cnn": 1024,
"facebook/bart-large-xsum": 1024,
"yjernite/bart_eli5": 1024,
}
class BartTokenizer(RobertaTokenizer):
@ -42,9 +60,6 @@ class BartTokenizer(RobertaTokenizer):
:class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
parameters and other methods.
"""
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
pretrained_vocab_files_map = {
"vocab_file": {m: vocab_url for m in _all_bart_models},
"merges_file": {m: merges_url for m in _all_bart_models},
}
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
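For context, a short usage sketch (not part of the diff; the checkpoint name is just an example taken from the map above) of how these class attributes are consumed when a tokenizer is loaded:

from transformers import BartTokenizer

# from_pretrained() looks the checkpoint up in pretrained_vocab_files_map to
# download vocab.json / merges.txt, and in max_model_input_sizes to set
# model_max_length (1024 for the BART checkpoints listed above).
tok = BartTokenizer.from_pretrained("facebook/bart-large")
print(tok.model_max_length)  # 1024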


@ -21,19 +21,44 @@ from .tokenization_bart import BartTokenizer
logger = logging.get_logger(__name__)
# vocab and merges same as roberta
vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json"
_all_bart_models = [
"facebook/bart-base",
"facebook/bart-large",
"facebook/bart-large-mnli",
"facebook/bart-large-cnn",
"facebook/bart-large-xsum",
"yjernite/bart_eli5",
# This is not exhaustive: see https://huggingface.co/models?filter=bart
]
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
# See all BART models at https://huggingface.co/models?filter=bart
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json",
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json",
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json",
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json",
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json",
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json",
},
"merges_file": {
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt",
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt",
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt",
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt",
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt",
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt",
},
"tokenizer_file": {
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/tokenizer.json",
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/tokenizer.json",
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/tokenizer.json",
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/tokenizer.json",
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/tokenizer.json",
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/bart-base": 1024,
"facebook/bart-large": 1024,
"facebook/bart-large-mnli": 1024,
"facebook/bart-large-cnn": 1024,
"facebook/bart-large-xsum": 1024,
"yjernite/bart_eli5": 1024,
}
class BartTokenizerFast(RobertaTokenizerFast):
@ -44,11 +69,7 @@ class BartTokenizerFast(RobertaTokenizerFast):
superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
initialization parameters and other methods.
"""
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
pretrained_vocab_files_map = {
"vocab_file": {m: vocab_url for m in _all_bart_models},
"merges_file": {m: merges_url for m in _all_bart_models},
"tokenizer_file": {m: tokenizer_url for m in _all_bart_models},
}
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = BartTokenizer


@ -29,7 +29,13 @@ logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
tokenizer_url = "https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder/resolve/main/spiece.model"
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"bert_for_seq_generation": "https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder/resolve/main/spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"bert_for_seq_generation": 512}
class BertGenerationTokenizer(PreTrainedTokenizer):
@ -55,8 +61,8 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = {"vocab_file": {"bert_for_seq_generation": tokenizer_url}}
max_model_input_sizes = {"bert_for_seq_generation": 512}
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
prefix_tokens: List[int] = []
model_input_names = ["input_ids", "attention_mask"]


@ -29,9 +29,18 @@ logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
# "tokenizer_config_file": "tokenizer_config.json",
"tokenizer_config_file": "tokenizer_config.json",
}
CKPT_3B = "facebook/blenderbot-3B"
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
"merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
"tokenizer_config_file": {
"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}
class BlenderbotTokenizer(RobertaTokenizer):
@ -45,19 +54,9 @@ class BlenderbotTokenizer(RobertaTokenizer):
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_config_file": "tokenizer_config.json",
}
pretrained_vocab_files_map = {
"vocab_file": {CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
"merges_file": {CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
"tokenizer_config_file": {
CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
},
}
max_model_input_sizes = {"facebook/blenderbot-3B": 128}
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None):
"""


@ -33,6 +33,20 @@ VOCAB_FILES_NAMES = {
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
},
"merges_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
},
"tokenizer_config_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot_small-90M": 512}
def get_pairs(word):
"""
@ -75,23 +89,9 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
"""
vocab_files_names = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_config": "tokenizer_config.json",
}
pretrained_vocab_files_map = {
"vocab_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
},
"merges_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
},
"tokenizer_config_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer.json"
},
}
max_model_input_sizes = {"facebook/blenderbot_small-90M": 512}
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(


@ -24,9 +24,23 @@ from .tokenization_blenderbot_small import BlenderbotSmallTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {}
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
},
"merges_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
},
"tokenizer_config_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/blenderbot_small-90M": 512,


@ -39,13 +39,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"camembert-base": 512,
}
SHARED_MODEL_IDENTIFIERS = [
# Load with
# `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")`
"Musixmatch/umberto-commoncrawl-cased-v1",
"Musixmatch/umberto-wikipedia-uncased-v1",
]
SPIECE_UNDERLINE = "▁"


@ -48,13 +48,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"camembert-base": 512,
}
SHARED_MODEL_IDENTIFIERS = [
# Load with
# `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")`
"Musixmatch/umberto-commoncrawl-cased-v1",
"Musixmatch/umberto-wikipedia-uncased-v1",
]
SPIECE_UNDERLINE = "▁"


@ -24,12 +24,12 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
"distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt",
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt",
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt",
}
}


@ -25,20 +25,20 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.jso
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
"distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt",
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt",
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt",
},
"tokenizer_file": {
"distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json",
"distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/tokenizer.json",
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/tokenizer.json",
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer.json",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/tokenizer.json",
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/tokenizer.json",
"distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json",
"distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/tokenizer.json",
},
}


@ -30,32 +30,32 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.jso
CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/vocab.txt",
"facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/vocab.txt",
},
"tokenizer_file": {
"facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/tokenizer.json",
"facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/tokenizer.json",
},
}
QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/vocab.txt",
"facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/vocab.txt",
},
"tokenizer_file": {
"facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/tokenizer.json",
"facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/tokenizer.json",
},
}
READER_PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/vocab.txt",
"facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/vocab.txt",
},
"tokenizer_file": {
"facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/tokenizer.json",
"facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/tokenizer.json",
},
}


@ -31,32 +31,32 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.jso
CONTEXT_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/vocab.txt",
"facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/vocab.txt",
},
"tokenizer_file": {
"facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-ctx_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-ctx_encoder-single-nq-base/resolve/main/tokenizer.json",
"facebook/dpr-ctx_encoder-multiset-base": "https://huggingface.co/facebook/dpr-ctx_encoder-multiset-base/resolve/main/tokenizer.json",
},
}
QUESTION_ENCODER_PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/vocab.txt",
"facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/vocab.txt",
},
"tokenizer_file": {
"facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-question_encoder-single-nq-base": "https://huggingface.co/facebook/dpr-question_encoder-single-nq-base/resolve/main/tokenizer.json",
"facebook/dpr-question_encoder-multiset-base": "https://huggingface.co/facebook/dpr-question_encoder-multiset-base/resolve/main/tokenizer.json",
},
}
READER_PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/vocab.txt",
"facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/vocab.txt",
},
"tokenizer_file": {
"facebook/dpr-reader-single-nq-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-reader-multiset-base": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"facebook/dpr-reader-single-nq-base": "https://huggingface.co/facebook/dpr-reader-single-nq-base/resolve/main/tokenizer.json",
"facebook/dpr-reader-multiset-base": "https://huggingface.co/facebook/dpr-reader-multiset-base/resolve/main/tokenizer.json",
},
}


@ -36,9 +36,13 @@ VOCAB_FILES_NAMES = {
}
PRETRAINED_VOCAB_FILES_MAP = {
"src_vocab_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/vocab-src.json"},
"tgt_vocab_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/vocab-tgt.json"},
"merges_file": {"stas/tiny-wmt19-en-de": "https://cdn.huggingface.co/stas/tiny-wmt19-en-de/merges.txt"},
"src_vocab_file": {
"stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-src.json"
},
"tgt_vocab_file": {
"stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/vocab-tgt.json"
},
"merges_file": {"stas/tiny-wmt19-en-de": "https://huggingface.co/stas/tiny-wmt19-en-de/resolve/main/merges.txt"},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"stas/tiny-wmt19-en-de": 1024}


@ -17,12 +17,7 @@ from typing import List, Optional, Tuple
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_herbert import (
PRETRAINED_INIT_CONFIGURATION,
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
PRETRAINED_VOCAB_FILES_MAP,
HerbertTokenizer,
)
from .tokenization_herbert import HerbertTokenizer
logger = logging.get_logger(__name__)
@ -32,6 +27,14 @@ VOCAB_FILES_NAMES = {
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/vocab.json"},
"merges_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/merges.txt"},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
PRETRAINED_INIT_CONFIGURATION = {}
class HerbertTokenizerFast(PreTrainedTokenizerFast):
"""


@ -25,8 +25,8 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
"microsoft/layoutlm-base-uncased": "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/vocab.txt",
"microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/vocab.txt",
}
}


@ -26,12 +26,12 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.jso
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
"microsoft/layoutlm-base-uncased": "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/vocab.txt",
"microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/vocab.txt",
},
"tokenizer_file": {
"microsoft/layoutlm-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"microsoft/layoutlm-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json",
"microsoft/layoutlm-base-uncased": "https://huggingface.co/microsoft/layoutlm-base-uncased/resolve/main/tokenizer.json",
"microsoft/layoutlm-large-uncased": "https://huggingface.co/microsoft/layoutlm-large-uncased/resolve/main/tokenizer.json",
},
}


@ -20,17 +20,24 @@ from ..roberta.tokenization_roberta import RobertaTokenizer
logger = logging.get_logger(__name__)
# vocab and merges same as roberta
vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
_all_longformer_models = [
"allenai/longformer-base-4096",
"allenai/longformer-large-4096",
"allenai/longformer-large-4096-finetuned-triviaqa",
"allenai/longformer-base-4096-extra.pos.embd.only",
"allenai/longformer-large-4096-extra.pos.embd.only",
]
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merge_file": "merges.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/vocab.json",
"allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/vocab.json",
"allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/vocab.json",
"allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/vocab.json",
"allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/vocab.json",
},
"merge_file": {
"allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/merges.txt",
"allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/merges.txt",
"allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/merges.txt",
"allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/merges.txt",
"allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"allenai/longformer-base-4096": 4096,
@ -48,9 +55,6 @@ class LongformerTokenizer(RobertaTokenizer):
:class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to the
superclass for usage examples and documentation concerning parameters.
"""
# merges and vocab same as Roberta
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = {
"vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models},
}


@ -21,18 +21,31 @@ from .tokenization_longformer import LongformerTokenizer
logger = logging.get_logger(__name__)
# vocab and merges same as roberta
vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json"
_all_longformer_models = [
"allenai/longformer-base-4096",
"allenai/longformer-large-4096",
"allenai/longformer-large-4096-finetuned-triviaqa",
"allenai/longformer-base-4096-extra.pos.embd.only",
"allenai/longformer-large-4096-extra.pos.embd.only",
]
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merge_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/vocab.json",
"allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/vocab.json",
"allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/vocab.json",
"allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/vocab.json",
"allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/vocab.json",
},
"merge_file": {
"allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/merges.txt",
"allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/merges.txt",
"allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/merges.txt",
"allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/merges.txt",
"allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/merges.txt",
},
"tokenizer_file": {
"allenai/longformer-base-4096": "https://huggingface.co/allenai/longformer-base-4096/resolve/main/tokenizer.json",
"allenai/longformer-large-4096": "https://huggingface.co/allenai/longformer-large-4096/resolve/main/tokenizer.json",
"allenai/longformer-large-4096-finetuned-triviaqa": "https://huggingface.co/allenai/longformer-large-4096-finetuned-triviaqa/resolve/main/tokenizer.json",
"allenai/longformer-base-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-base-4096-extra.pos.embd.only/resolve/main/tokenizer.json",
"allenai/longformer-large-4096-extra.pos.embd.only": "https://huggingface.co/allenai/longformer-large-4096-extra.pos.embd.only/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"allenai/longformer-base-4096": 4096,
@ -51,10 +64,7 @@ class LongformerTokenizerFast(RobertaTokenizerFast):
to the superclass for usage examples and documentation concerning parameters.
"""
# merges and vocab same as Roberta
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = {
"vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models},
"tokenizer_file": {m: tokenizer_url for m in _all_longformer_models},
}
slow_tokenizer_class = LongformerTokenizer


@ -16,33 +16,18 @@
from ..bert.tokenization_bert import BertTokenizer
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model ids.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/vocab.txt",
}
}
####################################################
# Mapping from model ids to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"unc-nlp/lxmert-base-uncased": 512,
}
####################################################
# Mapping from model ids to a dictionary of additional
# keyword arguments for Tokenizer `__init__`.
# To be used for checkpoint specific configurations.
####################################################
PRETRAINED_INIT_CONFIGURATION = {
"unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
}


@ -17,36 +17,21 @@ from ..bert.tokenization_bert_fast import BertTokenizerFast
from .tokenization_lxmert import LxmertTokenizer
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model ids.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/vocab.txt",
},
"tokenizer_file": {
"unc-nlp/lxmert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"unc-nlp/lxmert-base-uncased": "https://huggingface.co/unc-nlp/lxmert-base-uncased/resolve/main/tokenizer.json",
},
}
####################################################
# Mapping from model ids to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"unc-nlp/lxmert-base-uncased": 512,
}
####################################################
# Mapping from model ids to a dictionary of additional
# keyword arguments for Tokenizer `__init__`.
# To be used for checkpoint specific configurations.
####################################################
PRETRAINED_INIT_CONFIGURATION = {
"unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
}


@ -37,17 +37,21 @@ VOCAB_FILES_NAMES = {
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/vocab.json",
"facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/vocab.json",
},
"spm_file": {
"facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/sentencepiece.bpe.model",
"facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/sentencepiece.bpe.model",
},
"tokenizer_config_file": {
"facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/tokenizer_config.json",
"facebook/m2m100_1.2B": "https://huggingface.co/facebook/m2m100_1.2B/resolve/main/tokenizer_config.json",
},
}
ALL_M2M100_MODELS = ["facebook/m2m100_418M", "facebook/m2m100_1.2B"]
SPM_URL = "https://huggingface.co/facebook/m2m100_418M/resolve/main/sentence.bpe.model"
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/m2m100_418M": 1024,
}
# fmt: off
FAIRSEQ_LANGUAGE_CODES = ["af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig", "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln", "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"]
@ -96,7 +100,7 @@ class M2M100Tokenizer(PreTrainedTokenizer):
"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = {m: 1024 for m in ALL_M2M100_MODELS}
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]


@ -25,7 +25,7 @@ import sentencepiece
from ...tokenization_utils import PreTrainedTokenizer
vocab_files_names = {
VOCAB_FILES_NAMES = {
"source_spm": "source.spm",
"target_spm": "target.spm",
"vocab": "vocab.json",
@ -33,11 +33,17 @@ vocab_files_names = {
}
PRETRAINED_VOCAB_FILES_MAP = {
"source_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/source.spm"},
"target_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/target.spm"},
"vocab": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/vocab.json"},
"source_spm": {
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/source.spm"
},
"target_spm": {
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/target.spm"
},
"vocab": {
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/vocab.json"
},
"tokenizer_config_file": {
"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/tokenizer_config.json"
"Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/tokenizer_config.json"
},
}
@ -91,7 +97,7 @@ class MarianTokenizer(PreTrainedTokenizer):
>>> outputs = model(**inputs) should work
"""
vocab_files_names = vocab_files_names
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES


@ -23,8 +23,20 @@ from ..xlm_roberta.tokenization_xlm_roberta import XLMRobertaTokenizer
logger = logging.get_logger(__name__)
_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"]
SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model",
"facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-en-ro": 1024,
"facebook/mbart-large-cc25": 1024,
}
FAIRSEQ_LANGUAGE_CODES = [
"ar_AR",
@ -78,9 +90,9 @@ class MBartTokenizer(XLMRobertaTokenizer):
>>> inputs["labels"] = labels["input_ids"]
"""
vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
max_model_input_sizes = {m: 1024 for m in _all_mbart_models}
pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}}
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
prefix_tokens: List[int] = []
suffix_tokens: List[int] = []


@ -30,8 +30,15 @@ SPIECE_UNDERLINE = "▁"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
_all_mbart50_models = ["facebook/mbart-large-50-one-to-many-mmt"]
SPM_URL = "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model"
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-50-one-to-many-mmt": "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-50-one-to-many-mmt": 1024,
}
# fmt: off
FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]
@ -83,8 +90,8 @@ class MBart50Tokenizer(PreTrainedTokenizer):
"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = {m: 1024 for m in _all_mbart50_models}
pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart50_models}}
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
prefix_tokens: List[int] = []


@ -36,9 +36,18 @@ logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
_all_mbart50_models = ["facebook/mbart-large-50-one-to-many-mmt"]
SPM_URL = "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model"
tokenizer_URL = "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/tokenizer.json"
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-50-one-to-many-mmt": "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/sentencepiece.bpe.model",
},
"tokenizer_file": {
"facebook/mbart-large-50-one-to-many-mmt": "https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-50-one-to-many-mmt": 1024,
}
# fmt: off
FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]
@ -91,8 +100,8 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
"""
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = {m: 1024 for m in _all_mbart50_models}
pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart50_models}}
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = MBart50Tokenizer


@ -32,9 +32,24 @@ else:
logger = logging.get_logger(__name__)
_all_mbart_models = ["facebook/mbart-large-en-ro", "facebook/mbart-large-cc25"]
SPM_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentence.bpe.model"
tokenizer_URL = "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/tokenizer.json"
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/sentencepiece.bpe.model",
"facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/sentencepiece.bpe.model",
},
"tokenizer_file": {
"facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/tokenizer.json",
"facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/mbart-large-en-ro": 1024,
"facebook/mbart-large-cc25": 1024,
}
FAIRSEQ_LANGUAGE_CODES = [
"ar_AR",
@ -89,9 +104,9 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
>>> inputs["labels"] = labels["input_ids"]
"""
vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
max_model_input_sizes = {m: 1024 for m in _all_mbart_models}
pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}}
vocab_files_names = VOCAB_FILES_NAMES
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
slow_tokenizer_class = MBartTokenizer
prefix_tokens: List[int] = []


@ -27,28 +27,17 @@ from ...utils import logging
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model ids.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/reformer-crime-and-punishment": "https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/spiece.model"
}
}
####################################################
# Mapping from model ids to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/reformer-crime-and-punishment": 524288,
}


@ -32,19 +32,11 @@ else:
logger = logging.get_logger(__name__)
SPIECE_UNDERLINE = "▁"
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model ids.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"google/reformer-crime-and-punishment": "https://huggingface.co/google/reformer-crime-and-punishment/resolve/main/spiece.model"
@ -54,9 +46,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
},
}
####################################################
# Mapping from model ids to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"google/reformer-crime-and-punishment": 524288,
}


@ -24,7 +24,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt",
}
}


@ -25,10 +25,10 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.jso
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt",
},
"tokenizer_file": {
"yjernite/retribert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/tokenizer.json",
},
}


@ -34,16 +34,16 @@ PRETRAINED_VOCAB_FILES_MAP = {
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json",
},
"merges_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt",
},
}


@ -32,24 +32,24 @@ PRETRAINED_VOCAB_FILES_MAP = {
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json",
},
"merges_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt",
},
"tokenizer_file": {
"roberta-base": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
"roberta-large": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
"roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer.json",
"distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/tokenizer.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
"roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/tokenizer.json",
"roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/tokenizer.json",
},
}


@ -29,16 +29,8 @@ from ...utils import logging
logger = logging.get_logger(__name__)
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model ids.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"t5-small": "https://huggingface.co/t5-small/resolve/main/spiece.model",
@ -49,9 +41,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
}
}
####################################################
# Mapping from model ids to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"t5-small": 512,
"t5-base": 512,


@ -32,16 +32,8 @@ else:
logger = logging.get_logger(__name__)
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model ids.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"t5-small": "https://huggingface.co/t5-small/resolve/main/spiece.model",
@ -59,9 +51,6 @@ PRETRAINED_VOCAB_FILES_MAP = {
},
}
####################################################
# Mapping from model ids to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"t5-small": 512,
"t5-base": 512,


@ -37,6 +37,17 @@ VOCAB_FILES_NAMES = {
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json",
},
"tokenizer_config_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer_config.json",
},
}
# Wav2Vec2 has no max input length
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/wav2vec2-base-960h": sys.maxsize}
WAV2VEC2_KWARGS_DOCSTRING = r"""
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
@ -98,16 +109,8 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = {
"vocab_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
},
"tokenizer_config_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer.json",
},
}
# Wav2Vec2 has no max input length
max_model_input_sizes = {"facebook/wav2vec2-base-960h": sys.maxsize}
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
def __init__(


@ -65,6 +65,8 @@ from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.ca
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json",
@ -93,6 +95,7 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast):
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
@ -109,9 +112,16 @@ from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.ca
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {}
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
},
"tokenizer_file": {
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"{{cookiecutter.checkpoint_identifier}}": 1024,


@ -62,6 +62,8 @@ from ..bart.tokenization_bart import BartTokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json",
@ -90,6 +92,7 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer):
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@ -105,9 +108,13 @@ from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {}
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"{{cookiecutter.checkpoint_identifier}}": 1024,


@ -26,7 +26,7 @@ from transformers.testing_utils import require_sentencepiece
if is_sentencepiece_available():
from transformers.models.marian.tokenization_marian import save_json, vocab_files_names
from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json
from .test_tokenization_common import TokenizerTesterMixin
@ -50,11 +50,11 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
save_dir = Path(self.tmpdirname)
save_json(vocab_tokens, save_dir / vocab_files_names["vocab"])
save_json(mock_tokenizer_config, save_dir / vocab_files_names["tokenizer_config_file"])
if not (save_dir / vocab_files_names["source_spm"]).exists():
copyfile(SAMPLE_SP, save_dir / vocab_files_names["source_spm"])
copyfile(SAMPLE_SP, save_dir / vocab_files_names["target_spm"])
save_json(vocab_tokens, save_dir / VOCAB_FILES_NAMES["vocab"])
save_json(mock_tokenizer_config, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"])
if not (save_dir / VOCAB_FILES_NAMES["source_spm"]).exists():
copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["source_spm"])
copyfile(SAMPLE_SP, save_dir / VOCAB_FILES_NAMES["target_spm"])
tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname)
tokenizer.save_pretrained(self.tmpdirname)
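For reference, a condensed standalone sketch of the fixture this test now builds with the renamed VOCAB_FILES_NAMES constant. The directory path, the config values, and the sample SentencePiece model path below are placeholders, not values from the test.

from pathlib import Path
from shutil import copyfile

from transformers import MarianTokenizer
from transformers.models.marian.tokenization_marian import VOCAB_FILES_NAMES, save_json

save_dir = Path("/tmp/marian_fixture")  # stand-in for self.tmpdirname
save_dir.mkdir(parents=True, exist_ok=True)

# Write a tiny vocabulary and a minimal tokenizer config under the file names
# that VOCAB_FILES_NAMES maps the tokenizer's init kwargs to.
vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
save_json(dict(zip(vocab, range(len(vocab)))), save_dir / VOCAB_FILES_NAMES["vocab"])
save_json({"source_lang": "en", "target_lang": "de"}, save_dir / VOCAB_FILES_NAMES["tokenizer_config_file"])

sample_spm = "sample_sentencepiece.model"  # hypothetical path to a small SentencePiece model
copyfile(sample_spm, save_dir / VOCAB_FILES_NAMES["source_spm"])
copyfile(sample_spm, save_dir / VOCAB_FILES_NAMES["target_spm"])

tokenizer = MarianTokenizer.from_pretrained(str(save_dir))
tokenizer.save_pretrained(str(save_dir))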