diff --git a/setup.py b/setup.py
index 6ea9b192618..b563b11b622 100644
--- a/setup.py
+++ b/setup.py
@@ -181,7 +181,7 @@ _deps = [
     "timeout-decorator",
     "tiktoken",
     "timm<=0.9.16",
-    "tokenizers>=0.19,<0.20",
+    "tokenizers>=0.20,<0.21",
     "torch",
     "torchaudio",
     "torchvision",
diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index eb75a46a6d9..21876c7f61d 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -609,33 +609,12 @@ class SpmConverter(Converter):
             for id, p in enumerate(proto.pieces)
             if p.type in [3, 4]
         ]
-        tokens_to_add = [
-            AddedToken(token, normalized=False, special=special)
-            for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
-        ]
-
-        if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
-            tokens = []
-            for token in tokens_to_add:
-                is_special = token.special
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
-                else:
-                    if is_last_special:
-                        tokenizer.add_special_tokens(tokens)
-                    else:
-                        tokenizer.add_tokens(tokens)
-                    tokens = [token]
-                is_last_special = is_special
-            if tokens:
-                if is_last_special:
-                    tokenizer.add_special_tokens(tokens)
-                else:
-                    tokenizer.add_tokens(tokens)
+        tokenizer.add_tokens(
+            [
+                AddedToken(token, normalized=False, special=special)
+                for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
+            ]
+        )
 
         return tokenizer
 
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 2634a7b6b3f..6564e079033 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -86,7 +86,7 @@ deps = {
     "timeout-decorator": "timeout-decorator",
     "tiktoken": "tiktoken",
     "timm": "timm<=0.9.16",
-    "tokenizers": "tokenizers>=0.19,<0.20",
+    "tokenizers": "tokenizers>=0.20,<0.21",
     "torch": "torch",
     "torchaudio": "torchaudio",
     "torchvision": "torchvision",
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 94815caf352..cec91e038dd 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -175,15 +175,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
 
         # We call this after having initialized the backend tokenizer because we update it.
         super().__init__(**kwargs)
-
-        # Set the splitting mode for special tokens for the tokenizer to be used throughout the class.
         self._tokenizer.encode_special_tokens = self.split_special_tokens
 
-        # The following logic will be replace with a single add_tokens once a fix is pushed to tokenizers
-        # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens
-        # uses the information stored in `added_tokens_decoder`.
-        # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens
-        # Use hash to speed up the very slow operation `token not in added_tokens_decoder`.
         added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
         tokens_to_add = [
             token
@@ -197,10 +190,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         ]
 
         if len(tokens_to_add) > 0:
-            # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
-            # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
-            # individual tokens would repeatedly rebuild a trie, which can be slow.
-            is_last_special = None
             tokens = []
             special_tokens = self.all_special_tokens
             for token in tokens_to_add:
@@ -209,14 +198,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                     if isinstance(token, AddedToken)
                     else str(token) in special_tokens
                 )
-                if is_last_special is None or is_last_special == is_special:
-                    tokens.append(token)
+                if isinstance(token, str):
+                    token = AddedToken(token, special=is_special)
                 else:
-                    self._add_tokens(tokens, special_tokens=is_last_special)
-                    tokens = [token]
-                is_last_special = is_special
+                    token.special = is_special
+                tokens.append(token)
             if tokens:
-                self._add_tokens(tokens, special_tokens=is_last_special)
+                self.add_tokens(tokens)
 
     @property
     def is_fast(self) -> bool:
@@ -849,6 +837,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                     if special_tokens_map is not None:
                         tokens = [special_tokens_map.get(token, token) for token in tokens]
                     post_processor["special_tokens"][key]["tokens"] = tokens
+                    for token in tokens:
+                        token_id = tokenizer.token_to_id(token)
+                        if token_id is None:
+                            raise ValueError(
+                                "Attempted to set a token in the post processor that does not exist in the mapping"
+                            )
+
                     post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]
 
             for special_token in ["cls", "sep"]:
@@ -857,6 +852,10 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                     if special_tokens_map is not None and token in special_tokens_map:
                         token = special_tokens_map[token]
                     token_id = tokenizer.token_to_id(token)
+                    if token_id is None:
+                        raise ValueError(
+                            "Attempted to set a token in the post processor that does not exist in the mapping"
+                        )
                     post_processor[special_token] = [token, token_id]
 
             trained_tokenizer_json["post_processor"] = post_processor
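Note (not part of the diff): the convert_slow_tokenizer.py and tokenization_utils_fast.py changes rely on tokenizers>=0.20 honouring the `special` flag of `AddedToken` inside a single `add_tokens()` call, so the old workaround of flushing alternating `add_special_tokens()` / `add_tokens()` batches can be dropped. Below is a minimal, hedged sketch of that call pattern; the `WordLevel` toy vocabulary and the example tokens are hypothetical and chosen only to make the snippet runnable.

```python
# Minimal sketch, not part of the diff. Assumes tokenizers>=0.20, where
# Tokenizer.add_tokens() honours AddedToken.special, so one batched call
# replaces the alternating add_special_tokens()/add_tokens() loop removed above.
from tokenizers import AddedToken, Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

# Hypothetical toy vocabulary, only to make the example self-contained.
tokenizer = Tokenizer(WordLevel({"[UNK]": 0, "hello": 1, "world": 2}, unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Special and non-special tokens added in one batch: the added-token matcher
# is rebuilt once instead of once per special/non-special group.
tokenizer.add_tokens(
    [
        AddedToken("<eos>", normalized=False, special=True),  # special token
        AddedToken("<extra_id_0>", normalized=False, special=False),  # regular added token
    ]
)

print(tokenizer.encode("hello <eos>").tokens)  # expected: ['hello', '<eos>']
```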