bump tokenizers, fix added tokens fast (#32535)

* update based on tokenizers release

* update

* nits

* update

* revert re addition

* don't break that yet

* fmt

* revert unwanted

* update tokenizers version

* update dep table

* update

* update in conversion script as well

* some fix

* revert

* fully revert

* fix training

* remove set trace

* fixup

* update

* update
Arthur authored on 2024-09-25 13:47:20 +02:00, committed by GitHub
parent 5e2916bc14
commit c6379858f3
4 changed files with 24 additions and 46 deletions

setup.py

@@ -181,7 +181,7 @@ _deps = [
     "timeout-decorator",
     "tiktoken",
     "timm<=0.9.16",
-    "tokenizers>=0.19,<0.20",
+    "tokenizers>=0.20,<0.21",
     "torch",
     "torchaudio",
     "torchvision",

src/transformers/convert_slow_tokenizer.py

@@ -609,33 +609,12 @@ class SpmConverter(Converter):
             for id, p in enumerate(proto.pieces)
             if p.type in [3, 4]
         ]
-        tokens_to_add = [
-            AddedToken(token, normalized=False, special=special)
-            for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
-        ]
-        if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
-            tokens = []
-            for token in tokens_to_add:
-                is_special = token.special
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
-                else:
-                    if is_last_special:
-                        tokenizer.add_special_tokens(tokens)
-                    else:
-                        tokenizer.add_tokens(tokens)
-                    tokens = [token]
-                is_last_special = is_special
-            if tokens:
-                if is_last_special:
-                    tokenizer.add_special_tokens(tokens)
-                else:
-                    tokenizer.add_tokens(tokens)
+        tokenizer.add_tokens(
+            [
+                AddedToken(token, normalized=False, special=special)
+                for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
+            ]
+        )
         return tokenizer
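Note (not part of the diff): the special/non-special batching can be dropped because, with tokenizers>=0.20, a single add_tokens call is expected to honour the `special` flag carried by each AddedToken. A standalone sketch of that pattern with a toy backend model (the token strings are made up):

# Hedged sketch: with tokenizers>=0.20, a mixed batch of special and
# non-special AddedToken objects can go through one add_tokens call,
# so the trie is rebuilt once instead of once per special/non-special batch.
from tokenizers import AddedToken, Tokenizer
from tokenizers.models import BPE

tok = Tokenizer(BPE())  # empty toy model, just to have a backend
tok.add_tokens(
    [
        AddedToken("<extra_id_0>", normalized=False, special=True),
        AddedToken("new_word", normalized=False, special=False),
    ]
)
print(tok.get_vocab_size())  # 2: both tokens registered in a single pass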

src/transformers/dependency_versions_table.py

@@ -86,7 +86,7 @@ deps = {
     "timeout-decorator": "timeout-decorator",
     "tiktoken": "tiktoken",
     "timm": "timm<=0.9.16",
-    "tokenizers": "tokenizers>=0.19,<0.20",
+    "tokenizers": "tokenizers>=0.20,<0.21",
     "torch": "torch",
     "torchaudio": "torchaudio",
     "torchvision": "torchvision",

src/transformers/tokenization_utils_fast.py

@@ -175,15 +175,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         # We call this after having initialized the backend tokenizer because we update it.
         super().__init__(**kwargs)
         # Set the splitting mode for special tokens for the tokenizer to be used throughout the class.
         self._tokenizer.encode_special_tokens = self.split_special_tokens
-        # The following logic will be replace with a single add_tokens once a fix is pushed to tokenizers
-        # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
-        # uses the information stored in `added_tokens_decoder`.
-        # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
-        # Use hash to speed up the very slow operation `token not in added_tokens_decoder`.
         added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
         tokens_to_add = [
             token
@@ -197,10 +190,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         ]
         if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
             tokens = []
             special_tokens = self.all_special_tokens
             for token in tokens_to_add:
@@ -209,14 +198,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                     if isinstance(token, AddedToken)
                     else str(token) in special_tokens
                 )
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
+                if isinstance(token, str):
+                    token = AddedToken(token, special=is_special)
                 else:
-                    self._add_tokens(tokens, special_tokens=is_last_special)
-                    tokens = [token]
-                is_last_special = is_special
+                    token.special = is_special
+                tokens.append(token)
             if tokens:
-                self._add_tokens(tokens, special_tokens=is_last_special)
+                self.add_tokens(tokens)

     @property
     def is_fast(self) -> bool:
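Note (not part of the diff): the __init__ path now resolves each token's `special` flag up front and hands one list of AddedToken objects to add_tokens, instead of flushing alternating special/non-special batches through _add_tokens. A standalone sketch of that normalization step (the sample tokens are made up; mutating `.special` assumes tokenizers>=0.20):

# Hedged sketch of the new normalization: strings become AddedToken with the
# resolved `special` flag, existing AddedToken objects get the flag overwritten,
# and everything is then added in a single batch.
from tokenizers import AddedToken

special_tokens = {"<s>", "</s>"}                               # stand-in for self.all_special_tokens
tokens_to_add = ["</s>", AddedToken("<extra_id_0>"), "hello"]  # made-up mixed input

tokens = []
for token in tokens_to_add:
    is_special = (
        (token.special or str(token) in special_tokens)
        if isinstance(token, AddedToken)
        else str(token) in special_tokens
    )
    if isinstance(token, str):
        token = AddedToken(token, special=is_special)
    else:
        token.special = is_special
    tokens.append(token)

print([(t.content, t.special) for t in tokens])
# [('</s>', True), ('<extra_id_0>', False), ('hello', False)]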
@@ -849,6 +837,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                     if special_tokens_map is not None:
                         tokens = [special_tokens_map.get(token, token) for token in tokens]
                     post_processor["special_tokens"][key]["tokens"] = tokens
+                    for token in tokens:
+                        token_id = tokenizer.token_to_id(token)
+                        if token_id is None:
+                            raise ValueError(
+                                "Attempted to set a token in the post processor that does not exist in the mapping"
+                            )
                     post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

             for special_token in ["cls", "sep"]:
@@ -857,6 +852,10 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                 if special_tokens_map is not None and token in special_tokens_map:
                     token = special_tokens_map[token]
                 token_id = tokenizer.token_to_id(token)
+                if token_id is None:
+                    raise ValueError(
+                        "Attempted to set a token in the post processor that does not exist in the mapping"
+                    )
                 post_processor[special_token] = [token, token_id]

         trained_tokenizer_json["post_processor"] = post_processor
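Note (not part of the diff): the two new guards in train_new_from_iterator turn a silent None id in the retrained post-processor into an explicit error. A standalone illustration of the failure mode with a toy WordLevel vocabulary (made up for the example; the loop raises on "[SEP]"):

# Hedged sketch: token_to_id returns None for tokens missing from the vocabulary,
# which would otherwise end up as a null id inside the serialized post processor.
from tokenizers import Tokenizer
from tokenizers.models import WordLevel

tokenizer = Tokenizer(WordLevel({"hello": 0, "[CLS]": 1}, unk_token="hello"))

for token in ["[CLS]", "[SEP]"]:
    token_id = tokenizer.token_to_id(token)
    if token_id is None:
        raise ValueError(
            "Attempted to set a token in the post processor that does not exist in the mapping"
        )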