fix: add __bool__ operator to tokenizer to avoid bloated asserts (#38899)

* fix: add __bool__ operator to tokenizer to avoid bloated asserts. When a user writes 'assert tokenizer' to ensure that the tokenizer is not None, the truth test inadvertently sets off a rather expensive process in the '__len__()' operator. This fix adds a trivial '__bool__()' that returns True, so that a None tokenizer still fails the assertion and an actual tokenizer passes it, without calling the length operator. * typo
2025-07-02 12:20:05 +06:00 · 2025-06-23 23:32:16 +09:00 · 2025-06-23 23:32:16 +09:00 · 0c98f24889
commit 0c98f24889
parent d29482cc91
1 changed files with 6 additions and 0 deletions
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -278,6 +278,12 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

+    def __bool__(self) -> bool:
+        """
+        Always returns True so truth tests like `assert tokenizer` never fall back to the expensive `__len__`.
+        """
+        return True
+
    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.