From 0c98f24889f4dd7ca9f35f16186b59a66add2654 Mon Sep 17 00:00:00 2001
From: kallewoof
Date: Mon, 23 Jun 2025 23:32:16 +0900
Subject: [PATCH] fix: add __bool__ operator to tokenizer to avoid bloated asserts (#38899)

* fix: add __bool__ operator to tokenizer to avoid bloated asserts

When a user does 'assert tokenizer' to ensure that the tokenizer is not None, they inadvertently set off a rather expensive process in the '__len__()' operator. This fix adds a trivial '__bool__()' that returns True, so that a None tokenizer asserts and an actual tokenizer returns True when asserted, without calling length op.

* typo
---
 src/transformers/tokenization_utils_fast.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 9249fe5435b..3fecfa0e1dd 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -278,6 +278,12 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         """
         return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}
 
+    def __bool__(self) -> bool:
+        """
+        Returns True, to avoid expensive `assert tokenizer` gotchas.
+        """
+        return True
+
     def __len__(self) -> int:
         """
         Size of the full vocabulary with the added tokens.