mirror of
https://github.com/huggingface/transformers.git
synced 2025-07-09 07:40:07 +06:00
fix: switch from slow to generic tokenizer class (#15122)
This commit is contained in:
parent
27b819b0e3
commit
aa0135f2e0
@@ -2,7 +2,7 @@ from datasets import load_dataset
|
|||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from arguments import TokenizerTrainingArguments
|
from arguments import TokenizerTrainingArguments
|
||||||
from transformers import GPT2Tokenizer, HfArgumentParser
|
from transformers import AutoTokenizer, HfArgumentParser
|
||||||
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
|
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
|
||||||
|
|
||||||
|
|
||||||
@@ -17,7 +17,7 @@ parser = HfArgumentParser(TokenizerTrainingArguments)
|
|||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Base tokenizer
|
# Base tokenizer
|
||||||
tokenizer = GPT2Tokenizer.from_pretrained(args.base_tokenizer)
|
tokenizer = AutoTokenizer.from_pretrained(args.base_tokenizer)
|
||||||
base_vocab = list(bytes_to_unicode().values())
|
base_vocab = list(bytes_to_unicode().values())
|
||||||
|
|
||||||
# Load dataset
|
# Load dataset
|
||||||
|
Loading…
Reference in New Issue
Block a user