Fix bug in slow tokenizer conversion, make it a lot faster (#24266)

* Make conversion faster, fix None vs 0 bug

* Add second sort for consistency

* Update src/transformers/convert_slow_tokenizer.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Stephan Tulkens 2023-06-15 10:41:57 +02:00 committed by GitHub
parent 1609a436ec
commit 6793f0cfe0
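
The "None vs 0 bug" mentioned in the commit message comes from the old truthiness test on the merge score: a merge whose score is 0.0 is falsy, so the old check (if piece_score:) silently drops a pair that does exist in the model. A minimal illustration of the pitfall, using a made-up score rather than a real SentencePiece model:

    vocab_scores = {"ab": 0.0}  # toy value; real scores come from the SentencePiece model

    piece_score = vocab_scores.get("ab", None)
    if piece_score:              # old check: 0.0 is falsy, so this valid merge is skipped
        print("truthiness check keeps the merge")
    if piece_score is not None:  # an explicit None check would keep it
        print("explicit None check keeps the merge")

The rewritten loop in the diff below sidesteps the score test entirely: it iterates over vocab_scores itself and only checks that both halves of each merge are present in vocab.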


@@ -54,12 +54,15 @@ class SentencePieceExtractor:
         # Merges
         merges = []
-        for piece_l in vocab.keys():
-            for piece_r in vocab.keys():
-                merge = f"{piece_l}{piece_r}"
-                piece_score = vocab_scores.get(merge, None)
-                if piece_score:
-                    merges += [(piece_l, piece_r, piece_score)]
+        for merge, piece_score in vocab_scores.items():
+            local = []
+            for index in range(1, len(merge)):
+                piece_l, piece_r = merge[:index], merge[index:]
+                if piece_l in vocab and piece_r in vocab:
+                    local.append((piece_l, piece_r, piece_score))
+            local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
+            merges.extend(local)
         merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
         merges = [(val[0], val[1]) for val in merges]
         return vocab, merges
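
Taken out of the diff, the new extraction can be run as a standalone sketch. The vocab and scores below are toy values for illustration, not from the repository; in SentencePieceExtractor.extract() they come from the loaded SentencePiece model, and reverse is assumed True here (in the real method it is set earlier, depending on whether explicit scores were supplied):

    # Standalone sketch of the new merge extraction (toy data, not the real model).
    vocab = {"a": 0, "b": 1, "ab": 2, "abb": 3, "bb": 4}
    vocab_scores = {"ab": -1.5, "abb": 0.0, "bb": -2.0}
    reverse = True  # assumption for this sketch

    merges = []
    for merge, piece_score in vocab_scores.items():
        # Split each scored piece at every position and keep the splits whose two
        # halves are themselves vocab entries: roughly O(V * piece_length) work
        # instead of the old O(V**2) loop over all (piece_l, piece_r) pairs.
        local = []
        for index in range(1, len(merge)):
            piece_l, piece_r = merge[:index], merge[index:]
            if piece_l in vocab and piece_r in vocab:
                local.append((piece_l, piece_r, piece_score))
        # The "second sort for consistency": when a piece splits in several valid
        # ways, ordering by vocab ids keeps the output deterministic.
        local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
        merges.extend(local)

    merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
    merges = [(val[0], val[1]) for val in merges]
    print(merges)  # [('a', 'bb'), ('ab', 'b'), ('a', 'b'), ('b', 'b')]

Note that the splits of "abb" (score 0.0) survive here; the old truthiness check would have discarded them.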