BartTokenizerFast (#4878)

Suraj Patil authored 2020-06-14 22:34:49 +05:30, committed by GitHub
parent 403d309857
commit 9208f57b16
2 changed files with 11 additions and 2 deletions

src/transformers/__init__.py

@@ -118,7 +118,7 @@ from .pipelines import (
 # Tokenizers
 from .tokenization_albert import AlbertTokenizer
 from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
-from .tokenization_bart import BartTokenizer, MBartTokenizer
+from .tokenization_bart import BartTokenizer, BartTokenizerFast, MBartTokenizer
 from .tokenization_bert import BasicTokenizer, BertTokenizer, BertTokenizerFast, WordpieceTokenizer
 from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
 from .tokenization_camembert import CamembertTokenizer

src/transformers/tokenization_bart.py

@@ -16,7 +16,7 @@
 import logging
 from typing import List, Optional

-from .tokenization_roberta import RobertaTokenizer
+from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
 from .tokenization_utils import BatchEncoding
 from .tokenization_xlm_roberta import XLMRobertaTokenizer
@@ -44,6 +44,15 @@ class BartTokenizer(RobertaTokenizer):
     }


+class BartTokenizerFast(RobertaTokenizerFast):
+    # merges and vocab same as Roberta
+    max_model_input_sizes = {m: 1024 for m in _all_bart_models}
+    pretrained_vocab_files_map = {
+        "vocab_file": {m: vocab_url for m in _all_bart_models},
+        "merges_file": {m: merges_url for m in _all_bart_models},
+    }
+
+
 _all_mbart_models = ["facebook/mbart-large-en-ro"]
 SPM_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/sentence.bpe.model"
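
A minimal usage sketch (not part of the commit), assuming "facebook/bart-large" is among the checkpoints in _all_bart_models; since BartTokenizerFast only overrides the pretrained file maps and the 1024-token input size, reusing RoBERTa's vocab/merges files, its output should match the slow BartTokenizer:

from transformers import BartTokenizerFast

# Load the fast (Rust-backed) tokenizer, newly exported from the
# top-level package by the __init__.py change above.
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-large")

ids = tokenizer.encode("Hello world!")
print(ids)                    # token ids, wrapped in <s> ... </s>
print(tokenizer.decode(ids))  # "<s>Hello world!</s>"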