fix: switch from slow to generic tokenizer class (#15122)

This commit is contained in:
Leandro von Werra 2022-01-12 15:12:43 +01:00 committed by GitHub
parent 27b819b0e3
commit aa0135f2e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -2,7 +2,7 @@ from datasets import load_dataset
from tqdm import tqdm
from arguments import TokenizerTrainingArguments
-from transformers import GPT2Tokenizer, HfArgumentParser
+from transformers import AutoTokenizer, HfArgumentParser
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
@@ -17,7 +17,7 @@ parser = HfArgumentParser(TokenizerTrainingArguments)
args = parser.parse_args()
# Base tokenizer
-tokenizer = GPT2Tokenizer.from_pretrained(args.base_tokenizer)
+tokenizer = AutoTokenizer.from_pretrained(args.base_tokenizer)
base_vocab = list(bytes_to_unicode().values())
# Load dataset