[cleanup] MarianTokenizer: delete unused constants (#4802)

Sam Shleifer 2020-06-05 14:57:24 -04:00 committed by GitHub
parent acaa2e6267
commit 3723f30a18

@@ -7,7 +7,6 @@ from typing import Dict, List, Optional, Tuple, Union
 import sentencepiece
-from .file_utils import S3_BUCKET_PREFIX
 from .tokenization_utils import BatchEncoding, PreTrainedTokenizer
@@ -17,11 +16,6 @@ vocab_files_names = {
     "vocab": "vocab.json",
     "tokenizer_config_file": "tokenizer_config.json",
 }
-MODEL_NAMES = ("opus-mt-en-de",)  # TODO(SS): delete this, the only required constant is vocab_files_names
-PRETRAINED_VOCAB_FILES_MAP = {
-    k: {m: f"{S3_BUCKET_PREFIX}/Helsinki-NLP/{m}/{fname}" for m in MODEL_NAMES}
-    for k, fname in vocab_files_names.items()
-}
 # Example URL https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/vocab.json
@@ -41,8 +35,6 @@ class MarianTokenizer(PreTrainedTokenizer):
     """
     vocab_files_names = vocab_files_names
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    max_model_input_sizes = {m: 512 for m in MODEL_NAMES}
     model_input_names = ["attention_mask"]  # actually attention_mask, decoder_attention_mask
     language_code_re = re.compile(">>.+<<")  # type: re.Pattern
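
For context, a minimal, self-contained sketch of what the deleted comprehension evaluated to. The S3_BUCKET_PREFIX value is an assumption inferred from the example URL kept in the source comment (in the library it was imported from .file_utils), and only the two vocab_files_names entries visible in the diff are included:

    # Reproduction of the deleted constants; S3_BUCKET_PREFIX is assumed
    # here from the example URL, not taken from transformers.file_utils.
    S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"

    vocab_files_names = {
        "vocab": "vocab.json",
        "tokenizer_config_file": "tokenizer_config.json",
    }
    MODEL_NAMES = ("opus-mt-en-de",)

    # The deleted comprehension: one URL per (file kind, model name) pair.
    PRETRAINED_VOCAB_FILES_MAP = {
        k: {m: f"{S3_BUCKET_PREFIX}/Helsinki-NLP/{m}/{fname}" for m in MODEL_NAMES}
        for k, fname in vocab_files_names.items()
    }

    # Matches the example URL kept as a comment in the source file.
    assert (
        PRETRAINED_VOCAB_FILES_MAP["vocab"]["opus-mt-en-de"]
        == "https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/vocab.json"
    )

Since every entry is fully determined by vocab_files_names and the model name, the map carried no information of its own; as the deleted TODO noted, vocab_files_names is the only constant the tokenizer actually requires.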