Mirror of https://github.com/huggingface/transformers.git, synced 2025-07-31 02:02:21 +06:00
Improve PreTrainedTokenizerFast loading time when there are many added tokens (#31404)
* use hash
* use hash
* update

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
parent 6e56b83453
commit 1c7c34bc64
@@ -172,10 +172,12 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
         # uses the information stored in `added_tokens_decoder`.
         # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
+        # Use hash to speed up the very slow operation `token not in added_tokens_decoder`.
+        added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
         tokens_to_add = [
             token
             for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])
-            if token not in self.added_tokens_decoder
+            if hash(repr(token)) not in added_tokens_decoder_hash
         ]
         encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add]
         # if some of the special tokens are strings, we check if we don't already have a token
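The speedup comes from the filter condition: as the added comment notes, evaluating `token not in added_tokens_decoder` for every candidate token was very slow, so the commit precomputes one set of `hash(repr(token))` values and checks candidates against that set instead. Below is a minimal standalone sketch of the idea, not the transformers source; `ToyAddedToken`, `expensive_decoder`, and the collection sizes are hypothetical stand-ins used only to illustrate the pattern.

# Standalone sketch of the hash-set filtering idea (hypothetical names, not transformers code).
from dataclasses import dataclass


@dataclass(frozen=True)
class ToyAddedToken:
    # Stand-in for an added-token object with a deterministic repr.
    content: str
    special: bool = False


def expensive_decoder(tokens):
    # Simulates a decoder mapping that is rebuilt into {index: token} on every access.
    return {i: tok for i, tok in enumerate(tokens)}


existing = [ToyAddedToken(f"<tok_{i}>") for i in range(1_000)]
candidates = {i: ToyAddedToken(f"<tok_{i}>") for i in range(500, 1_500)}

# Slow pattern: the decoder is rebuilt and scanned once per candidate (quadratic overall).
slow = [
    token
    for index, token in sorted(candidates.items(), key=lambda x: x[0])
    if token not in expensive_decoder(existing).values()
]

# Fast pattern (what the commit does): hash every known token once up front,
# then do constant-time set lookups while filtering the candidates.
added_tokens_decoder_hash = {hash(repr(token)) for token in existing}
fast = [
    token
    for index, token in sorted(candidates.items(), key=lambda x: x[0])
    if hash(repr(token)) not in added_tokens_decoder_hash
]

assert slow == fast  # both keep only the genuinely new tokens (<tok_1000> .. <tok_1499>)

Hashing `repr(token)` rather than the token object itself presumably sidesteps any reliance on the token class's own `__hash__`/`__eq__` behavior; the trade-off is that the comparison depends on the token's `repr` being stable and field-complete.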