diff --git a/examples/research_projects/codeparrot/scripts/bpe_training.py b/examples/research_projects/codeparrot/scripts/bpe_training.py index dd211cf4acf..8a3d6ee9eec 100644 --- a/examples/research_projects/codeparrot/scripts/bpe_training.py +++ b/examples/research_projects/codeparrot/scripts/bpe_training.py @@ -2,7 +2,7 @@ from datasets import load_dataset from tqdm import tqdm from arguments import TokenizerTrainingArguments -from transformers import GPT2Tokenizer, HfArgumentParser +from transformers import AutoTokenizer, HfArgumentParser from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode @@ -17,7 +17,7 @@ parser = HfArgumentParser(TokenizerTrainingArguments) args = parser.parse_args() # Base tokenizer -tokenizer = GPT2Tokenizer.from_pretrained(args.base_tokenizer) +tokenizer = AutoTokenizer.from_pretrained(args.base_tokenizer) base_vocab = list(bytes_to_unicode().values()) # Load dataset