Mirror of https://github.com/huggingface/transformers.git
Fix bug in slow tokenizer conversion, make it a lot faster (#24266)
* Make conversion faster, fix None vs 0 bug
* Add second sort for consistency
* Update src/transformers/convert_slow_tokenizer.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
parent 1609a436ec
commit 6793f0cfe0
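Background on the two problems the patch addresses, both visible in the diff below: the old loop built every possible pair of vocabulary pieces (quadratic in vocabulary size), and its guard `if piece_score:` treated a legitimate score of 0.0 the same as a missing merge. A minimal illustration of the falsy-zero issue (the toy dictionary is made up for this note):

    # 0.0 is a valid SentencePiece score, but it is falsy in Python,
    # so a truthiness check silently drops the merge.
    scores = {"ab": 0.0}
    piece_score = scores.get("ab", None)

    if piece_score:              # False for 0.0 -> merge would be lost
        print("kept by truthiness check")
    if piece_score is not None:  # True for 0.0 -> merge is kept
        print("kept by explicit None check")

The rewritten loop avoids the lookup-with-default entirely by iterating `vocab_scores.items()` directly, so a missing key never comes into play.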
@@ -54,12 +54,15 @@ class SentencePieceExtractor:
         # Merges
         merges = []
-        for piece_l in vocab.keys():
-            for piece_r in vocab.keys():
-                merge = f"{piece_l}{piece_r}"
-                piece_score = vocab_scores.get(merge, None)
-                if piece_score:
-                    merges += [(piece_l, piece_r, piece_score)]
+        for merge, piece_score in vocab_scores.items():
+            local = []
+            for index in range(1, len(merge)):
+                piece_l, piece_r = merge[:index], merge[index:]
+                if piece_l in vocab and piece_r in vocab:
+                    local.append((piece_l, piece_r, piece_score))
+            local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
+            merges.extend(local)

         merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
         merges = [(val[0], val[1]) for val in merges]
         return vocab, merges
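For reference, a self-contained sketch of the new extraction logic with toy data (the vocabulary and scores below are invented for illustration; in `SentencePieceExtractor.extract` they come from the SentencePiece model):

    # Toy inputs, invented for illustration.
    vocab = {"a": 0, "b": 1, "ab": 2, "abb": 3}
    vocab_scores = {"ab": -1.0, "abb": -2.0}
    reverse = True

    merges = []
    for merge, piece_score in vocab_scores.items():
        local = []
        # Try every split point of the merged piece and keep the splits
        # whose halves both exist in the vocabulary.
        for index in range(1, len(merge)):
            piece_l, piece_r = merge[:index], merge[index:]
            if piece_l in vocab and piece_r in vocab:
                local.append((piece_l, piece_r, piece_score))
        # The second sort (by vocab ids) keeps the output deterministic
        # when several splits of one merge share the same score.
        local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
        merges.extend(local)

    merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
    merges = [(val[0], val[1]) for val in merges]
    print(merges)  # [('a', 'b'), ('ab', 'b')]

This touches each merge string once and each of its split points once, instead of enumerating every (piece_l, piece_r) pair of the vocabulary, which is where the speedup in the commit title comes from.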