Mirror of https://github.com/huggingface/transformers.git
fix: switch from slow to generic tokenizer class (#15122)
Commit aa0135f2e0 (parent 27b819b0e3)
@@ -2,7 +2,7 @@ from datasets import load_dataset
 from tqdm import tqdm

 from arguments import TokenizerTrainingArguments
-from transformers import GPT2Tokenizer, HfArgumentParser
+from transformers import AutoTokenizer, HfArgumentParser
 from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode


@@ -17,7 +17,7 @@ parser = HfArgumentParser(TokenizerTrainingArguments)
 args = parser.parse_args()

 # Base tokenizer
-tokenizer = GPT2Tokenizer.from_pretrained(args.base_tokenizer)
+tokenizer = AutoTokenizer.from_pretrained(args.base_tokenizer)
 base_vocab = list(bytes_to_unicode().values())

 # Load dataset
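For context, a minimal sketch of what the switch buys, assuming the script goes on to train a new tokenizer from a corpus iterator (the "gpt2" checkpoint, the toy corpus, and the vocab size below are illustrative placeholders, not taken from the diff): AutoTokenizer resolves to the fast, Rust-backed tokenizer class when one exists, and only fast tokenizers expose train_new_from_iterator.

# Sketch only: shows the generic-vs-slow tokenizer distinction, not the
# actual training script.
from transformers import AutoTokenizer

# AutoTokenizer returns the fast tokenizer when available (GPT2TokenizerFast
# for "gpt2"); GPT2Tokenizer is the slow pure-Python implementation.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
assert tokenizer.is_fast

# Only fast tokenizers provide train_new_from_iterator, which a
# tokenizer-training script relies on.
corpus = ["def add(a, b):", "    return a + b", "print(add(1, 2))"]
new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=1000)
new_tokenizer.save_pretrained("./new-tokenizer")

Loading through the generic class also keeps the script checkpoint-agnostic: args.base_tokenizer can name any model for which a fast tokenizer is available, not just GPT-2.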