Repository: https://github.com/huggingface/transformers.git
bump tokenizers, fix added tokens fast (#32535)
* update based on tokenizers release
* update
* nits
* update
* revert re addition
* don't break that yet
* fmt
* revert unwanted
* update tokenizers version
* update dep table
* update
* update in conversion script as well
* some fix
* revert
* fully revert
* fix training
* remove set trace
* fixup
* update
* update
Parent: 5e2916bc14
Commit: c6379858f3
--- a/setup.py
+++ b/setup.py
@@ -181,7 +181,7 @@ _deps = [
     "timeout-decorator",
     "tiktoken",
     "timm<=0.9.16",
-    "tokenizers>=0.19,<0.20",
+    "tokenizers>=0.20,<0.21",
     "torch",
     "torchaudio",
     "torchvision",
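The pin moves the tokenizers dependency from the 0.19 series to 0.20. For reference, a minimal sketch (not part of this commit, and assuming the third-party packaging library is installed) that checks whether a local environment satisfies the new range:

# Sketch only: verify the installed tokenizers version against the new pin.
from importlib.metadata import version

from packaging.specifiers import SpecifierSet

installed = version("tokenizers")               # e.g. "0.20.0"
ok = installed in SpecifierSet(">=0.20,<0.21")  # the range pinned above
print(f"tokenizers {installed} satisfies '>=0.20,<0.21': {ok}")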
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -609,33 +609,12 @@ class SpmConverter(Converter):
             for id, p in enumerate(proto.pieces)
             if p.type in [3, 4]
         ]
-        tokens_to_add = [
-            AddedToken(token, normalized=False, special=special)
-            for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
-        ]
-
-        if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
-            tokens = []
-            for token in tokens_to_add:
-                is_special = token.special
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
-                else:
-                    if is_last_special:
-                        tokenizer.add_special_tokens(tokens)
-                    else:
-                        tokenizer.add_tokens(tokens)
-                    tokens = [token]
-                is_last_special = is_special
-            if tokens:
-                if is_last_special:
-                    tokenizer.add_special_tokens(tokens)
-                else:
-                    tokenizer.add_tokens(tokens)
+        tokenizer.add_tokens(
+            [
+                AddedToken(token, normalized=False, special=special)
+                for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
+            ]
+        )
 
         return tokenizer
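The removed "super hack" existed because add_tokens used to ignore the special flag on an AddedToken, forcing the converter to alternate between add_special_tokens and add_tokens batches. With the bumped tokenizers release a single add_tokens call is enough. A rough sketch of that behaviour against the tokenizers API directly (the toy vocabulary and token strings are made up for illustration):

# Sketch only: special and non-special tokens added in one call.
# Assumes tokenizers>=0.20, where AddedToken(..., special=True) is honoured by add_tokens.
from tokenizers import AddedToken, Tokenizer
from tokenizers.models import WordLevel

tok = Tokenizer(WordLevel({"hello": 0, "[UNK]": 1}, unk_token="[UNK]"))
tok.add_tokens(
    [
        AddedToken("<extra_0>", normalized=False, special=True),   # special token
        AddedToken("<draft>", normalized=False, special=False),    # regular added token
    ]
)
print(tok.token_to_id("<extra_0>"), tok.token_to_id("<draft>"))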
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -86,7 +86,7 @@ deps = {
     "timeout-decorator": "timeout-decorator",
     "tiktoken": "tiktoken",
     "timm": "timm<=0.9.16",
-    "tokenizers": "tokenizers>=0.19,<0.20",
+    "tokenizers": "tokenizers>=0.20,<0.21",
     "torch": "torch",
     "torchaudio": "torchaudio",
     "torchvision": "torchvision",
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -175,15 +175,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
 
         # We call this after having initialized the backend tokenizer because we update it.
         super().__init__(**kwargs)
-
-        # Set the splitting mode for special tokens for the tokenizer to be used throughout the class.
         self._tokenizer.encode_special_tokens = self.split_special_tokens
 
-        # The following logic will be replace with a single add_tokens once a fix is pushed to tokenizers
-        # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
-        # uses the information stored in `added_tokens_decoder`.
-        # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
-        # Use hash to speed up the very slow operation `token not in added_tokens_decoder`.
         added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
         tokens_to_add = [
             token
@@ -197,10 +190,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         ]
 
         if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
             tokens = []
             special_tokens = self.all_special_tokens
             for token in tokens_to_add:
@@ -209,14 +198,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                     if isinstance(token, AddedToken)
                     else str(token) in special_tokens
                 )
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
+                if isinstance(token, str):
+                    token = AddedToken(token, special=is_special)
                 else:
-                    self._add_tokens(tokens, special_tokens=is_last_special)
-                    tokens = [token]
-                is_last_special = is_special
+                    token.special = is_special
+                tokens.append(token)
             if tokens:
-                self._add_tokens(tokens, special_tokens=is_last_special)
+                self.add_tokens(tokens)
 
     @property
     def is_fast(self) -> bool:
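Instead of batching runs of special and non-special tokens for separate _add_tokens calls, the new code normalizes every entry to an AddedToken carrying the right special flag and hands the whole list to a single add_tokens call. A standalone sketch of that normalization step (the function name is illustrative, not part of the class, and the writable token.special attribute assumes the tokenizers version pinned above):

# Sketch only: plain strings become AddedToken instances, existing AddedToken
# instances get their special flag set, and everything is added in one batch.
from tokenizers import AddedToken


def normalize_added_tokens(tokens_to_add, special_tokens):
    tokens = []
    for token in tokens_to_add:
        is_special = (
            (token.special or str(token) in special_tokens)
            if isinstance(token, AddedToken)
            else str(token) in special_tokens
        )
        if isinstance(token, str):
            token = AddedToken(token, special=is_special)
        else:
            token.special = is_special  # writable with the pinned tokenizers release
        tokens.append(token)
    return tokens


print(normalize_added_tokens(["<extra_0>", AddedToken("<draft>")], {"<extra_0>"}))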
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -849,6 +837,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                 if special_tokens_map is not None:
                     tokens = [special_tokens_map.get(token, token) for token in tokens]
                 post_processor["special_tokens"][key]["tokens"] = tokens
+                for token in tokens:
+                    token_id = tokenizer.token_to_id(token)
+                    if token_id is None:
+                        raise ValueError(
+                            "Attempted to set a token in the post processor that does not exist in the mapping"
+                        )
+
                 post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]
 
             for special_token in ["cls", "sep"]:
@@ -857,6 +852,10 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                 if special_tokens_map is not None and token in special_tokens_map:
                     token = special_tokens_map[token]
                 token_id = tokenizer.token_to_id(token)
+                if token_id is None:
+                    raise ValueError(
+                        "Attempted to set a token in the post processor that does not exist in the mapping"
+                    )
                 post_processor[special_token] = [token, token_id]
 
             trained_tokenizer_json["post_processor"] = post_processor
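Both hunks add the same guard: token_to_id returns None for a token that is missing from the retrained vocabulary, and the post processor would otherwise silently pick up None ids. A quick illustration against the tokenizers API (toy vocabulary, not from this commit):

# Sketch only: token_to_id returns None for unknown tokens, the condition the new
# ValueError guards against when rebuilding the post processor.
from tokenizers import Tokenizer
from tokenizers.models import WordLevel

tok = Tokenizer(WordLevel({"[CLS]": 0, "[SEP]": 1, "[UNK]": 2}, unk_token="[UNK]"))

for token in ["[CLS]", "[MASK]"]:
    print(token, "->", tok.token_to_id(token))  # "[MASK]" -> None would trigger the new ValueError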