Fix mask token handling (#14364)

* Fix mask token handling
* Revert "Fix mask token handling"
  This reverts commit daaa3f5291.
* Fix FNet mask token tokenization
2025-07-31 02:02:21 +06:00 · 2021-12-02 03:16:52 +08:00 · 2021-12-02 03:16:52 +08:00 · 934e2799da
commit 934e2799da
parent 4df7d05a87
2 changed files with 14 additions and 4 deletions
--- a/src/transformers/models/fnet/tokenization_fnet.py
+++ b/src/transformers/models/fnet/tokenization_fnet.py
@@ -113,8 +113,13 @@ class FNetTokenizer(PreTrainedTokenizer):
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> None:
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        # The mask token behaves like a normal word, i.e. it includes the space before it.
+        # Because it appears in the raw text, it must be matched against the non-normalized sentence.
+        mask_token = (
+            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+            if isinstance(mask_token, str)
+            else mask_token
+        )

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

--- a/src/transformers/models/fnet/tokenization_fnet_fast.py
+++ b/src/transformers/models/fnet/tokenization_fnet_fast.py
@@ -107,8 +107,13 @@ class FNetTokenizerFast(PreTrainedTokenizerFast):
        mask_token="[MASK]",
        **kwargs
    ):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+        # The mask token behaves like a normal word, i.e. it includes the space before it.
+        # Because it appears in the raw text, it must be matched against the non-normalized sentence.
+        mask_token = (
+            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
+            if isinstance(mask_token, str)
+            else mask_token
+        )

        super().__init__(
            vocab_file,